diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 797eb76d9..2332c86dc 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -226,6 +226,31 @@ def count_variants_by_group( ) +def get_downsampling_freq_indices( + freq_meta_expr: hl.expr.ArrayExpression, + pop: str = "global", + variant_quality: str = "adj", +) -> hl.expr.ArrayExpression: + """ + Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified "pop" and "variant_quality" values. + + :param freq_meta_expr: ArrayExpression containing the set of groupings for each + element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', + 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). + :param pop: Population to use for filtering by the 'pop' key in `freq_meta_expr`. + Default is 'global'. + :param variant_quality: Variant quality to use for filtering by the 'group' key in + `freq_meta_expr`. Default is 'adj'. + """ + indices = hl.enumerate(freq_meta_expr).filter( + lambda f: (f[1].get("group") == variant_quality) + & (f[1].get("pop") == pop) + & f[1].contains("downsampling") + ) + # Get an array of indices and meta dictionaries sorted by "downsampling" key. + return hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"])) + + def downsampling_counts_expr( freq_expr: hl.expr.ArrayExpression, freq_meta_expr: hl.expr.ArrayExpression, @@ -257,18 +282,10 @@ def downsampling_counts_expr( :return: Aggregation Expression for an array of the variant counts in downsamplings for specified population. """ - # Get indices of dictionaries in meta dictionaries that only have the - # "downsampling" key with specified "group" and "pop" values. - indices = hl.enumerate(freq_meta_expr).filter( - lambda f: (f[1].size() == 3) - & (f[1].get("group") == variant_quality) - & (f[1].get("pop") == pop) - & f[1].contains("downsampling") - ) # Get an array of indices sorted by "downsampling" key. - sorted_indices = hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"])).map( - lambda x: x[0] - ) + sorted_indices = get_downsampling_freq_indices( + freq_meta_expr, pop, variant_quality + ).map(lambda x: x[0]) def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: """ diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index ee5366c1b..ebe7bb215 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -443,3 +443,63 @@ def filter_y_nonpar( if isinstance(t, hl.Table) else t.filter_rows(non_par_expr) ) + + +def filter_by_numeric_expr_range( + t: Union[hl.MatrixTable, hl.Table], + filter_expr: hl.NumericExpression, + filter_range: tuple, + keep_between: bool = True, + inclusive: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter rows in the Table/MatrixTable based on the range of a numeric expression. + + :param t: Input Table/MatrixTable. + :param filter_expr: NumericExpression to apply `filter_range` to. + :param filter_range: Range of values to apply to `filter_expr`. + :param keep_between: Whether to keep the values between `filter_range` instead of keeping values outside `filter_range`. Default is True. + :param inclusive: Whether or not to include the `filter_range` values themselves. Default is True. + :return: Table/MatrixTable filtered to rows with specified criteria. + """ + if inclusive and keep_between or not inclusive and not keep_between: + criteria = (filter_expr >= filter_range[0]) & (filter_expr <= filter_range[1]) + else: + criteria = (filter_expr > filter_range[0]) & (filter_expr < filter_range[1]) + + if isinstance(t, hl.MatrixTable): + return t.filter_rows(criteria, keep=keep_between) + else: + return t.filter(criteria, keep=keep_between) + + +def filter_for_mu( + ht: hl.Table, gerp_lower_cutoff: float = -3.9885, gerp_upper_cutoff: float = 2.6607 +) -> hl.Table: + """ + Filter to non-coding annotations and remove GERP outliers. + + .. note:: + + Values for `gerp_lower_cutoff` and `gerp_upper_cutoff` default to -3.9885 and + 2.6607, respectively. These values were precalculated on the GRCh37 context + table and define the 5th and 95th percentiles. + + :param ht: Input Table. + :param gerp_lower_cutoff: Minimum GERP score for variant to be included. Default is -3.9885. + :param gerp_upper_cutoff: Maximum GERP score for variant to be included. Default is 2.6607. + :return: Table filtered to intron or intergenic variants with GERP outliers removed. + """ + ht = filter_by_numeric_expr_range( + ht, + filter_expr=ht.gerp, + filter_range=(gerp_lower_cutoff, gerp_upper_cutoff), + keep_between=True, + inclusive=False, + ) + ht = ht.filter( + (ht.vep.most_severe_consequence == "intron_variant") + | (ht.vep.most_severe_consequence == "intergenic_variant") + ) + + return ht