Skip to content
Merged
39 changes: 28 additions & 11 deletions gnomad/utils/constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,31 @@ def count_variants_by_group(
)


def get_downsampling_freq_indices(
freq_meta_expr: hl.expr.ArrayExpression,
pop: str = "global",
variant_quality: str = "adj",
) -> hl.expr.ArrayExpression:
"""
Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified "pop" and "variant_quality" values.

:param freq_meta_expr: ArrayExpression containing the set of groupings for each
element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj',
'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]).
:param pop: Population to use for filtering by the 'pop' key in `freq_meta_expr`.
Default is 'global'.
:param variant_quality: Variant quality to use for filtering by the 'group' key in
`freq_meta_expr`. Default is 'adj'.
"""
indices = hl.enumerate(freq_meta_expr).filter(
lambda f: (f[1].get("group") == variant_quality)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just confirming lambda f: (f[1].size() == 3) is not needed. There is no case where there will be something like: {'downsampling': '5000', 'group': 'adj', 'pop': 'global'} and {'downsampling': '5000', 'group': 'adj', 'pop': 'global', 'other_strata':'some_val} and we only want {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

at least not in the existing datasets, but I could leave it in to be on the safe size

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine for now and we can update if our downsampling frequencies get more complex

& (f[1].get("pop") == pop)
& f[1].contains("downsampling")
)
# Get an array of indices and meta dictionaries sorted by "downsampling" key.
return hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"]))


def downsampling_counts_expr(
freq_expr: hl.expr.ArrayExpression,
freq_meta_expr: hl.expr.ArrayExpression,
Expand Down Expand Up @@ -257,18 +282,10 @@ def downsampling_counts_expr(
:return: Aggregation Expression for an array of the variant counts in downsamplings
for specified population.
"""
# Get indices of dictionaries in meta dictionaries that only have the
# "downsampling" key with specified "group" and "pop" values.
indices = hl.enumerate(freq_meta_expr).filter(
lambda f: (f[1].size() == 3)
& (f[1].get("group") == variant_quality)
& (f[1].get("pop") == pop)
& f[1].contains("downsampling")
)
# Get an array of indices sorted by "downsampling" key.
sorted_indices = hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"])).map(
lambda x: x[0]
)
sorted_indices = get_downsampling_freq_indices(
freq_meta_expr, pop, variant_quality
).map(lambda x: x[0])

def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression:
"""
Expand Down
60 changes: 60 additions & 0 deletions gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,3 +443,63 @@ def filter_y_nonpar(
if isinstance(t, hl.Table)
else t.filter_rows(non_par_expr)
)


def filter_by_numeric_expr_range(
t: Union[hl.MatrixTable, hl.Table],
filter_expr: hl.NumericExpression,
filter_range: tuple,
keep_between: bool = True,
inclusive: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
"""
Filter rows in the Table/MatrixTable based on the range of a numeric expression.

:param t: Input Table/MatrixTable.
:param filter_expr: NumericExpression to apply `filter_range` to.
:param filter_range: Range of values to apply to `filter_expr`.
:param keep_between: Whether to keep the values between `filter_range` instead of keeping values outside `filter_range`. Default is True.
:param inclusive: Whether or not to include the `filter_range` values themselves. Default is True.
:return: Table/MatrixTable filtered to rows with specified criteria.
"""
if inclusive and keep_between or not inclusive and not keep_between:
criteria = (filter_expr >= filter_range[0]) & (filter_expr <= filter_range[1])
else:
criteria = (filter_expr > filter_range[0]) & (filter_expr < filter_range[1])

if isinstance(t, hl.MatrixTable):
return t.filter_rows(criteria, keep=keep_between)
else:
return t.filter(criteria, keep=keep_between)


def filter_for_mu(
ht: hl.Table, gerp_lower_cutoff: float = -3.9885, gerp_upper_cutoff: float = 2.6607
) -> hl.Table:
"""
Filter to non-coding annotations and remove GERP outliers.

.. note::

Values for `gerp_lower_cutoff` and `gerp_upper_cutoff` default to -3.9885 and
2.6607, respectively. These values were precalculated on the GRCh37 context
table and define the 5th and 95th percentiles.

:param ht: Input Table.
:param gerp_lower_cutoff: Minimum GERP score for variant to be included. Default is -3.9885.
:param gerp_upper_cutoff: Maximum GERP score for variant to be included. Default is 2.6607.
:return: Table filtered to intron or intergenic variants with GERP outliers removed.
"""
ht = filter_by_numeric_expr_range(
ht,
filter_expr=ht.gerp,
filter_range=(gerp_lower_cutoff, gerp_upper_cutoff),
keep_between=True,
inclusive=False,
)
ht = ht.filter(
(ht.vep.most_severe_consequence == "intron_variant")
| (ht.vep.most_severe_consequence == "intergenic_variant")
)

return ht