diff --git a/CHANGELOG.md b/CHANGELOG.md index c619353d9..949107b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,19 @@ # Changes ## Unreleased +* Added function `region_flag_expr` to flag problematic regions [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) +* Added function `missing_callstats_expr` to create a Hail Struct with missing values that is inserted into frequency annotation arrays when data is missing [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) +* Added function `set_female_y_metrics_to_na_expr` to set Y-variant frequency callstats for female-specific metrics to missing [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) +* Added function `make_faf_index_dict` to create a look-up Dictionary for entries contained in the filter allele frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) +* Added function `make_freq_index_dict` to create a look-up Dictionary for entries contained in the frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) ## Version 0.5.0 - April 22nd, 2021 ### Fixed * Fix for error in `generate_trio_stats_expr` that led to an incorrect untransmitted count. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) -* Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) -* Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) +* Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) +* Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error" [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) * Fixed handling of missing entries (not within a ref block / alt site) when computing `coverage_stats` in `sparse_mt.py` [[#242]](https://github.com/broadinstitute/gnomad_methods/pull/242) * Fix for error in `compute_stratified_sample_qc` where `gt_expr` caused error [(#259)](https://github.com/broadinstitute/gnomad_methods/pull/259) * Fix for error in `default_lift_data` caused by missing `results` field in `new_locus` [(#270)](https://github.com/broadinstitute/gnomad_methods/pull/270) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 6434f0885..6c76d4454 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -38,6 +38,7 @@ def pop_max_expr( pops_to_exclude: Optional[Set[str]] = None, ) -> hl.expr.StructExpression: """ + Create an expression containing the frequency information about the population that has the highest AF in `freq_meta`. Populations specified in `pops_to_exclude` are excluded and only frequencies from adj populations are considered. @@ -761,7 +762,7 @@ def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpr def annotation_type_is_numeric(t: Any) -> bool: """ - Given an annotation type, returns whether it is a numerical type or not. + Given an annotation type, return whether it is a numerical type or not. :param t: Type to test :return: If the input type is numeric @@ -959,6 +960,81 @@ def unphase_call_expr(call_expr: hl.expr.CallExpression) -> hl.expr.CallExpressi ) +def region_flag_expr( + t: Union[hl.Table, hl.MatrixTable], + non_par: bool = True, + prob_regions: Dict[str, hl.Table] = None, +) -> hl.expr.StructExpression: + """ + Create a `region_flag` struct that contains flags for problematic regions (i.e., LCR, decoy, segdup, and nonpar regions). + + .. note:: No hg38 resources for decoy or self chain are available yet. + + :param t: Input Table/MatrixTable + :param non_par: If True, flag loci that occur within pseudoautosomal regions on sex chromosomes + :param prob_regions: If supplied, flag loci that occur within regions defined in Hail Table(s) + :return: `region_flag` struct row annotation + """ + prob_flags_expr = ( + {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} + ) + + if prob_regions is not None: + prob_flags_expr.update( + { + region_name: hl.is_defined(region_table[t.locus]) + for region_name, region_table in prob_regions.items() + } + ) + + return hl.struct(**prob_flags_expr) + + +def missing_callstats_expr() -> hl.expr.StructExpression: + """ + Create a missing callstats struct for insertion into frequency annotation arrays when data is missing. + + :return: Hail Struct with missing values for each callstats element + """ + return hl.struct( + AC=hl.missing(hl.tint32), + AF=hl.missing(hl.tfloat64), + AN=hl.missing(hl.tint32), + homozygote_count=hl.missing(hl.tint32), + ) + + +def set_female_y_metrics_to_na_expr( + t: Union[hl.Table, hl.MatrixTable] +) -> hl.expr.ArrayExpression: + """ + Set Y-variant frequency callstats for female-specific metrics to missing structs. + + .. note:: Requires freq, freq_meta, and freq_index_dict annotations to be present in Table or MatrixTable + + :param t: Table or MatrixTable for which to adjust female metrics + :return: Hail array expression to set female Y-variant metrics to missing values + """ + female_idx = hl.map( + lambda x: t.freq_index_dict[x], + hl.filter(lambda x: x.contains("XX"), t.freq_index_dict.keys()), + ) + freq_idx_range = hl.range(hl.len(t.freq_meta)) + + new_freq_expr = hl.if_else( + (t.locus.in_y_nonpar() | t.locus.in_y_par()), + hl.map( + lambda x: hl.if_else( + female_idx.contains(x), missing_callstats_expr(), t.freq[x] + ), + freq_idx_range, + ), + t.freq, + ) + + return new_freq_expr + + def hemi_expr( locus: hl.expr.LocusExpression, sex_expr: hl.expr.StringExpression, diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py new file mode 100644 index 000000000..cd0a8a4a6 --- /dev/null +++ b/gnomad/utils/release.py @@ -0,0 +1,93 @@ +# noqa: D100 + +from typing import Dict, List, Optional + +from gnomad.resources.grch38.gnomad import ( + GROUPS, + POPS, + SEXES, + SUBSETS, +) +from gnomad.utils.vcf import index_globals + + +def make_faf_index_dict( + faf_meta: List[Dict[str, str]], + groups: List[str] = ["adj"], + pops: List[str] = POPS, + sexes: List[str] = SEXES, + label_delimiter: str = "_", +) -> Dict[str, int]: + """ + Create a look-up Dictionary for entries contained in the filter allele frequency annotation array. + + :param faf_meta: Global annotation containing the set of groupings for each element of the faf array + (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}]) + :param groups: List of sample groups [adj, raw]. Default is GROUPS + :param pops: List of sample global population names for gnomAD genomes. Default is POPS + :param sexes: List of sample sexes used in VCF export. Default is SEXES + :param label_delimiter: String used as delimiter when making group label combinations + :return: Dictionary of faf annotation population groupings, where values are the corresponding 0-based indices for the + groupings in the faf_meta array + """ + + def _get_index(label_groups): + return index_globals(faf_meta, label_groups, label_delimiter) + + index_dict = { + **_get_index(dict(group=groups)), + **_get_index(dict(group=groups, pop=pops)), + **_get_index(dict(group=groups, sex=sexes)), + **_get_index(dict(group=groups, pop=pops, sex=sexes)), + } + return index_dict + + +def make_freq_index_dict( + freq_meta: List[Dict[str, str]], + groups: List[str] = GROUPS, + pops: List[str] = POPS, + sexes: List[str] = SEXES, + subsets: List[str] = SUBSETS, + downsamplings: Optional[List[int]] = None, + label_delimiter: str = "_", +) -> Dict[str, int]: + """ + Create a look-up Dictionary for entries contained in the frequency annotation array. + + .. note: + + Downsampling groupings are only computed on 'adj'-filtered genotypes + + :param freq_meta: List containing the set of groupings for each element of the freq array + (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}]) + :param groups: List of sample groups [adj, raw]. Default is GROUPS + :param pops: List of sample global population names for gnomAD genomes. Default is POPS + :param sexes: List of sample sexes used in VCF export. Default is SEXES + :param subset_list: List of sample subsets in dataset. Default is SUBSETS + :param downsamplings: List of downsampling cohort sizes present in global frequency array + :param label_delimiter: String used as delimiter when making group label combinations + :return: Dictionary keyed by the grouping combinations found in the frequency array, where values are the corresponding + 0-based indices for the groupings in the freq_meta array + """ + + def _get_index(label_groups): + return index_globals(freq_meta, label_groups, label_delimiter) + + index_dict = { + **_get_index(dict(group=groups)), + **_get_index(dict(group=groups, pop=pops)), + **_get_index(dict(group=groups, sex=sexes)), + **_get_index(dict(group=groups, pop=pops, sex=sexes)), + **_get_index(dict(group=groups, subset=subsets)), + **_get_index(dict(group=groups, subset=subsets, pop=pops)), + **_get_index(dict(group=groups, subset=subsets, sex=sexes)), + **_get_index(dict(group=groups, subset=subsets, pop=pops, sex=sexes)), + } + + if downsamplings: + index_dict.update( + {**_get_index(dict(downsampling=downsamplings, group=["adj"], pop=pops))} + ) + + return index_dict diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 2ae84af60..d91b9206d 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -54,8 +54,10 @@ "AS_MQ", "AS_MQRankSum", "AS_pab_max", + "AS_QUALapprox", "AS_QD", "AS_ReadPosRankSum", + "AS_SB_TABLE", "AS_SOR", "AS_VarDP", "InbreedingCoeff", @@ -69,8 +71,10 @@ "FS", "MQ", "MQRankSum", + "QUALapprox", "QD", "ReadPosRankSum", + "SB", "SOR", "VarDP", ]