From 36361fffbe9833ad0130afffc04726ce74f86b23 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 20 Oct 2023 12:11:10 -0600 Subject: [PATCH 01/16] Change `get_downsampling_freq_indices` and `downsampling_counts_expr` to support both 'pop' and 'gen_anc' keys inf metadata --- gnomad/utils/constraint.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index f0a51574b..f1c55a808 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -230,21 +230,32 @@ def get_downsampling_freq_indices( freq_meta_expr: hl.expr.ArrayExpression, pop: str = "global", variant_quality: str = "adj", + genetic_ancestry_label: Optional[str] = None, ) -> hl.expr.ArrayExpression: """ - Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified "pop" and "variant_quality" values. + Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param pop: Population to use for filtering by the 'pop' key in `freq_meta_expr`. - Default is 'global'. + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. + :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, + "gen_anc" or "pop" is used (in that order of preference) if present. Default is + None. + :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only + have the "downsampling" key with specified `genetic_ancestry_label` and + "variant_quality" values. """ + if genetic_ancestry_label is None: + gen_anc = ["gen_anc", "pop"] + else: + gen_anc = [genetic_ancestry_label] indices = hl.enumerate(freq_meta_expr).filter( lambda f: (f[1].get("group") == variant_quality) - & (f[1].get("pop") == pop) + & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) & f[1].contains("downsampling") ) # Get an array of indices and meta dictionaries sorted by "downsampling" key. @@ -258,33 +269,37 @@ def downsampling_counts_expr( variant_quality: str = "adj", singleton: bool = False, max_af: Optional[float] = None, + genetic_ancestry_label: Optional[str] = None, ) -> hl.expr.ArrayExpression: """ Return an aggregation expression to compute an array of counts of all downsamplings found in `freq_expr` where specified criteria is met. The frequency metadata (`freq_meta_expr`) should be in a similar format to the `freq_meta` annotation added by `annotate_freq()`. Each downsampling should have - 'group', 'pop', and 'downsampling' keys. Included downsamplings are those where - 'group' == `variant_quality` and 'pop' == `pop`. + 'group', `genetic_ancestry_label`, and 'downsampling' keys. Included downsamplings + are those where 'group' == `variant_quality` and `genetic_ancestry_label` == `pop`. :param freq_expr: ArrayExpression of Structs with 'AC' and 'AF' annotations. :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param pop: Population to use for filtering by the 'pop' key in `freq_meta_expr`. - Default is 'global'. + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. :param singleton: Whether to filter to only singletons before counting (AC == 1). Default is False. :param max_af: Maximum variant allele frequency to keep. By default no allele frequency cutoff is applied. + :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, + "gen_anc" or "pop" is used (in that order of preference) if present. Default is + None. :return: Aggregation Expression for an array of the variant counts in downsamplings for specified population. """ # Get an array of indices sorted by "downsampling" key. sorted_indices = get_downsampling_freq_indices( - freq_meta_expr, pop, variant_quality + freq_meta_expr, pop, variant_quality, genetic_ancestry_label ).map(lambda x: x[0]) def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: From ce2db443e95bbad72d1b593933b8a75f813fed16 Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 27 Mar 2024 10:52:22 -0400 Subject: [PATCH 02/16] add subset param --- gnomad/utils/constraint.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index f1c55a808..ee7e34a25 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -231,6 +231,7 @@ def get_downsampling_freq_indices( pop: str = "global", variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, + subset: Optional[str] = "None", ) -> hl.expr.ArrayExpression: """ Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -245,6 +246,8 @@ def get_downsampling_freq_indices( :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, "gen_anc" or "pop" is used (in that order of preference) if present. Default is None. + :param subset: Subset to use for filtering by the 'subset' key in + `freq_meta_expr`. Default is "None". :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -257,6 +260,7 @@ def get_downsampling_freq_indices( lambda f: (f[1].get("group") == variant_quality) & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) & f[1].contains("downsampling") + & (f[1].get("subset", "None") == subset) ) # Get an array of indices and meta dictionaries sorted by "downsampling" key. return hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"])) @@ -270,6 +274,7 @@ def downsampling_counts_expr( singleton: bool = False, max_af: Optional[float] = None, genetic_ancestry_label: Optional[str] = None, + subset: Optional[str] = "None", ) -> hl.expr.ArrayExpression: """ Return an aggregation expression to compute an array of counts of all downsamplings found in `freq_expr` where specified criteria is met. @@ -294,12 +299,14 @@ def downsampling_counts_expr( :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, "gen_anc" or "pop" is used (in that order of preference) if present. Default is None. + :param subset: Subset to use for filtering by the 'subset' key in + `freq_meta_expr`. Default is "None". :return: Aggregation Expression for an array of the variant counts in downsamplings for specified population. """ # Get an array of indices sorted by "downsampling" key. sorted_indices = get_downsampling_freq_indices( - freq_meta_expr, pop, variant_quality, genetic_ancestry_label + freq_meta_expr, pop, variant_quality, genetic_ancestry_label, subset ).map(lambda x: x[0]) def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: From 832ad8a7c2cfe07f3004cf59f6205acfc8db72b3 Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 27 Mar 2024 15:31:16 -0400 Subject: [PATCH 03/16] add downsamplings param --- gnomad/utils/constraint.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index ee7e34a25..e8250f836 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -7,6 +7,7 @@ import hail as hl from hail.utils.misc import divide_null, new_temp_file +from gnomad.resources.grch38.gnomad import DOWNSAMPLINGS from gnomad.utils.vep import explode_by_vep_annotation, process_consequences logging.basicConfig( @@ -232,6 +233,7 @@ def get_downsampling_freq_indices( variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, subset: Optional[str] = "None", + downsamplings: list = DOWNSAMPLINGS["v4"], ) -> hl.expr.ArrayExpression: """ Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -248,6 +250,7 @@ def get_downsampling_freq_indices( None. :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. Default is "None". + :param downsamplings: List of strings specifying what levels of downsamplings to obtain. :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -260,6 +263,7 @@ def get_downsampling_freq_indices( lambda f: (f[1].get("group") == variant_quality) & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) & f[1].contains("downsampling") + & hl.literal(downsamplings).contains(hl.int(f[1].get("downsampling", "0"))) & (f[1].get("subset", "None") == subset) ) # Get an array of indices and meta dictionaries sorted by "downsampling" key. @@ -275,6 +279,7 @@ def downsampling_counts_expr( max_af: Optional[float] = None, genetic_ancestry_label: Optional[str] = None, subset: Optional[str] = "None", + downsamplings: list = DOWNSAMPLINGS["v4"], ) -> hl.expr.ArrayExpression: """ Return an aggregation expression to compute an array of counts of all downsamplings found in `freq_expr` where specified criteria is met. @@ -301,12 +306,18 @@ def downsampling_counts_expr( None. :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. Default is "None". + :param downsamplings: List of strings specifying what levels of downsamplings to obtain. :return: Aggregation Expression for an array of the variant counts in downsamplings for specified population. """ # Get an array of indices sorted by "downsampling" key. sorted_indices = get_downsampling_freq_indices( - freq_meta_expr, pop, variant_quality, genetic_ancestry_label, subset + freq_meta_expr, + pop, + variant_quality, + genetic_ancestry_label, + subset, + downsamplings, ).map(lambda x: x[0]) def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: From 0dc5a160f1b8613b64198683e3e06d8e7746a0be Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 28 Mar 2024 16:12:33 -0400 Subject: [PATCH 04/16] pop -> gen_anc --- gnomad/utils/constraint.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index e8250f836..c7d0b520e 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -229,7 +229,7 @@ def count_variants_by_group( def get_downsampling_freq_indices( freq_meta_expr: hl.expr.ArrayExpression, - pop: str = "global", + gen_anc: str = "global", variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, subset: Optional[str] = "None", @@ -241,7 +241,7 @@ def get_downsampling_freq_indices( :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + :param gen_anc: Genetic ancestry group to use for filtering by the `genetic_ancestry_label` key in `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. @@ -261,7 +261,7 @@ def get_downsampling_freq_indices( gen_anc = [genetic_ancestry_label] indices = hl.enumerate(freq_meta_expr).filter( lambda f: (f[1].get("group") == variant_quality) - & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) + & (hl.any([f[1].get(l, "") == gen_anc for l in gen_anc])) & f[1].contains("downsampling") & hl.literal(downsamplings).contains(hl.int(f[1].get("downsampling", "0"))) & (f[1].get("subset", "None") == subset) @@ -273,7 +273,7 @@ def get_downsampling_freq_indices( def downsampling_counts_expr( freq_expr: hl.expr.ArrayExpression, freq_meta_expr: hl.expr.ArrayExpression, - pop: str = "global", + gen_anc: str = "global", variant_quality: str = "adj", singleton: bool = False, max_af: Optional[float] = None, @@ -293,7 +293,7 @@ def downsampling_counts_expr( :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + :param gen_anc: Genetic ancestry group to use for filtering by the `genetic_ancestry_label` key in `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. @@ -313,7 +313,7 @@ def downsampling_counts_expr( # Get an array of indices sorted by "downsampling" key. sorted_indices = get_downsampling_freq_indices( freq_meta_expr, - pop, + gen_anc, variant_quality, genetic_ancestry_label, subset, @@ -337,7 +337,7 @@ def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: # Map `_get_criteria` function to each downsampling indexed by `sorted_indices` to # generate a list of 1's and 0's for each variant, where the length of the array is - # the total number of downsamplings for the specified population and each element + # the total number of downsamplings for the specified genetic ancestry group and each element # in the array indicates if the variant in the downsampling indexed by # `sorted_indices` meets the specified criteria. # Return an array sum aggregation that aggregates arrays generated from mapping. From 01374224db51455735d5659f793e7a1563d2f0bf Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 28 Mar 2024 16:17:48 -0400 Subject: [PATCH 05/16] revert --- gnomad/utils/constraint.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index c7d0b520e..e8250f836 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -229,7 +229,7 @@ def count_variants_by_group( def get_downsampling_freq_indices( freq_meta_expr: hl.expr.ArrayExpression, - gen_anc: str = "global", + pop: str = "global", variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, subset: Optional[str] = "None", @@ -241,7 +241,7 @@ def get_downsampling_freq_indices( :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param gen_anc: Genetic ancestry group to use for filtering by the `genetic_ancestry_label` key in + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. @@ -261,7 +261,7 @@ def get_downsampling_freq_indices( gen_anc = [genetic_ancestry_label] indices = hl.enumerate(freq_meta_expr).filter( lambda f: (f[1].get("group") == variant_quality) - & (hl.any([f[1].get(l, "") == gen_anc for l in gen_anc])) + & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) & f[1].contains("downsampling") & hl.literal(downsamplings).contains(hl.int(f[1].get("downsampling", "0"))) & (f[1].get("subset", "None") == subset) @@ -273,7 +273,7 @@ def get_downsampling_freq_indices( def downsampling_counts_expr( freq_expr: hl.expr.ArrayExpression, freq_meta_expr: hl.expr.ArrayExpression, - gen_anc: str = "global", + pop: str = "global", variant_quality: str = "adj", singleton: bool = False, max_af: Optional[float] = None, @@ -293,7 +293,7 @@ def downsampling_counts_expr( :param freq_meta_expr: ArrayExpression containing the set of groupings for each element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). - :param gen_anc: Genetic ancestry group to use for filtering by the `genetic_ancestry_label` key in + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in `freq_meta_expr`. Default is 'global'. :param variant_quality: Variant quality to use for filtering by the 'group' key in `freq_meta_expr`. Default is 'adj'. @@ -313,7 +313,7 @@ def downsampling_counts_expr( # Get an array of indices sorted by "downsampling" key. sorted_indices = get_downsampling_freq_indices( freq_meta_expr, - gen_anc, + pop, variant_quality, genetic_ancestry_label, subset, @@ -337,7 +337,7 @@ def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: # Map `_get_criteria` function to each downsampling indexed by `sorted_indices` to # generate a list of 1's and 0's for each variant, where the length of the array is - # the total number of downsamplings for the specified genetic ancestry group and each element + # the total number of downsamplings for the specified population and each element # in the array indicates if the variant in the downsampling indexed by # `sorted_indices` meets the specified criteria. # Return an array sum aggregation that aggregates arrays generated from mapping. From 7f963ba0e3acea1b23b63867dd9ed2ab1b984a78 Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 28 Mar 2024 16:21:10 -0400 Subject: [PATCH 06/16] small edit --- gnomad/utils/constraint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index e8250f836..8a2ec28cc 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -215,7 +215,7 @@ def count_variants_by_group( pop, ) agg[f"singleton_downsampling_counts_{pop}"] = downsampling_counts_expr( - freq_expr, freq_meta_expr, pop, singleton=True + freq_expr, freq_meta_expr, pop, max_af=max_af, singleton=True ) # Apply each variant count aggregation in `agg` to get counts for all # combinations of `grouping`. From 5228c629755d0f2a7530126b4e492776723e67a7 Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:41:17 -0400 Subject: [PATCH 07/16] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 4a0876ab2..3096a973a 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -233,7 +233,7 @@ def get_downsampling_freq_indices( variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, subset: Optional[str] = "None", - downsamplings: list = DOWNSAMPLINGS["v4"], + downsamplings: Optional[List[int]] = None, ) -> hl.expr.ArrayExpression: """ Get indices of dictionaries in meta dictionaries that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -250,7 +250,8 @@ def get_downsampling_freq_indices( None. :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. Default is "None". - :param downsamplings: List of strings specifying what levels of downsamplings to obtain. + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling indices. :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only have the "downsampling" key with specified `genetic_ancestry_label` and "variant_quality" values. @@ -278,8 +279,8 @@ def downsampling_counts_expr( singleton: bool = False, max_af: Optional[float] = None, genetic_ancestry_label: Optional[str] = None, - subset: Optional[str] = "None", - downsamplings: list = DOWNSAMPLINGS["v4"], + subset: Optional[str] = None, + downsamplings: Optional[List[int]] = None, ) -> hl.expr.ArrayExpression: """ Return an aggregation expression to compute an array of counts of all downsamplings found in `freq_expr` where specified criteria is met. From 1674fb1d40063531ea2befb51e4f76c33c3c9e9f Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:45:04 -0400 Subject: [PATCH 08/16] use overall model for global pop --- gnomad/utils/constraint.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 3096a973a..25d078639 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -250,7 +250,7 @@ def get_downsampling_freq_indices( None. :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. Default is "None". - :param downsamplings: Optional List of integers specifying what downsampling + :param downsamplings: Optional List of integers specifying what downsampling indices to obtain. Default is None, which will return all downsampling indices. :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only have the "downsampling" key with specified `genetic_ancestry_label` and @@ -985,6 +985,14 @@ def compute_expected_variants( intercept = plateau_model[0] agg_func = hl.agg.sum ann_to_sum = ["observed_variants", "possible_variants"] + # If genetic ancestry group is "global", use the overall plateau model, but sum across array of downsampling counts. + elif pop == "global": + plateau_model = hl.literal(plateau_models_expr.total)[cpg_expr] + slope = plateau_model[1] + intercept = plateau_model[0] + agg_func = hl.agg.array_sum + pop = f"_{pop}" + ann_to_sum = [f"downsampling_counts{pop}"] else: plateau_model = hl.literal(plateau_models_expr[pop]) slope = hl.map(lambda f: f[cpg_expr][1], plateau_model) From 003fdc4fa523183ba8e87eaefe7641f209673c10 Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:49:48 -0400 Subject: [PATCH 09/16] PR suggestion --- gnomad/utils/constraint.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 25d078639..19a142bb4 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -260,13 +260,25 @@ def get_downsampling_freq_indices( gen_anc = ["gen_anc", "pop"] else: gen_anc = [genetic_ancestry_label] - indices = hl.enumerate(freq_meta_expr).filter( - lambda f: (f[1].get("group") == variant_quality) - & (hl.any([f[1].get(l, "") == pop for l in gen_anc])) - & f[1].contains("downsampling") - & hl.literal(downsamplings).contains(hl.int(f[1].get("downsampling", "0"))) - & (f[1].get("subset", "None") == subset) - ) + + def _get_filter_expr(m: hl.expr.StructExpression) -> hl.expr.BooleanExpression: + filter_expr = ( + (m.get("group") == variant_quality) + & (hl.any([m.get(l, "") == pop for l in gen_anc])) + & m.contains("downsampling") + ) + if downsamplings is not None: + filter_expr &= hl.literal(downsamplings).contains( + hl.int(m.get("downsampling", "0")) + ) + if subset is None: + filter_expr &= ~m.contains("subset") + else: + filter_expr &= m.get("subset", "") == subset + return filter_expr + + indices = hl.enumerate(freq_meta_expr).filter(lambda f: _get_filter_expr(f[1])) + # Get an array of indices and meta dictionaries sorted by "downsampling" key. return hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"])) From 68e182fd2d24cc9d840769cee8ed8d7e30e5c15c Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:51:41 -0400 Subject: [PATCH 10/16] Update gnomad/utils/constraint.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 19a142bb4..aa4de9ae2 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -317,9 +317,12 @@ def downsampling_counts_expr( :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, "gen_anc" or "pop" is used (in that order of preference) if present. Default is None. - :param subset: Subset to use for filtering by the 'subset' key in - `freq_meta_expr`. Default is "None". - :param downsamplings: List of strings specifying what levels of downsamplings to obtain. + :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. + Default is None, which will return all downsampling counts without a 'subset' + key in `freq_meta_expr`. If specified, only downsamplings with the specified + subset will be included. + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling counts. :return: Aggregation Expression for an array of the variant counts in downsamplings for specified population. """ From 0fdebdd94adb30bc6fab04e58feb66e2daafcbc3 Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:51:49 -0400 Subject: [PATCH 11/16] Update gnomad/utils/constraint.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index aa4de9ae2..ddd20ea3d 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -232,7 +232,7 @@ def get_downsampling_freq_indices( pop: str = "global", variant_quality: str = "adj", genetic_ancestry_label: Optional[str] = None, - subset: Optional[str] = "None", + subset: Optional[str] = None, downsamplings: Optional[List[int]] = None, ) -> hl.expr.ArrayExpression: """ From d4ac5bf5d58dc6e1c8494d71f5f1d2dd17de993d Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:52:05 -0400 Subject: [PATCH 12/16] Update gnomad/utils/constraint.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index ddd20ea3d..48220ecac 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -248,8 +248,9 @@ def get_downsampling_freq_indices( :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, "gen_anc" or "pop" is used (in that order of preference) if present. Default is None. - :param subset: Subset to use for filtering by the 'subset' key in - `freq_meta_expr`. Default is "None". + :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. + Default is None, which will return all downsampling indices without a 'subset' + key in `freq_meta_expr`. :param downsamplings: Optional List of integers specifying what downsampling indices to obtain. Default is None, which will return all downsampling indices. :return: ArrayExpression of indices of dictionaries in `freq_meta_expr` that only From 793f1c5e2a28ae2a708a83c25735b560ad60d028 Mon Sep 17 00:00:00 2001 From: klaricch Date: Thu, 4 Apr 2024 12:59:30 -0400 Subject: [PATCH 13/16] small edit --- gnomad/utils/constraint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 48220ecac..cd3fdd807 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1001,7 +1001,8 @@ def compute_expected_variants( intercept = plateau_model[0] agg_func = hl.agg.sum ann_to_sum = ["observed_variants", "possible_variants"] - # If genetic ancestry group is "global", use the overall plateau model, but sum across array of downsampling counts. + # If genetic ancestry group is "global", use the overall plateau model, + # but sum across array of downsampling counts. elif pop == "global": plateau_model = hl.literal(plateau_models_expr.total)[cpg_expr] slope = plateau_model[1] From b296616821b4dd31fd78738453d6228b61c565c4 Mon Sep 17 00:00:00 2001 From: klaricch Date: Fri, 5 Apr 2024 12:55:03 -0400 Subject: [PATCH 14/16] small edit --- gnomad/utils/constraint.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index cd3fdd807..4287e0b29 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -205,7 +205,11 @@ def count_variants_by_group( pop, ) agg[f"downsampling_counts_{pop}"] = downsampling_counts_expr( - freq_expr, freq_meta_expr, pop, max_af=max_af + freq_expr, + freq_meta_expr, + pop, + max_af=max_af, + downsamplings=DOWNSAMPLINGS["v4"], ) if count_singletons: logger.info( @@ -215,7 +219,12 @@ def count_variants_by_group( pop, ) agg[f"singleton_downsampling_counts_{pop}"] = downsampling_counts_expr( - freq_expr, freq_meta_expr, pop, max_af=max_af, singleton=True + freq_expr, + freq_meta_expr, + pop, + max_af=max_af, + downsamplings=DOWNSAMPLINGS["v4"], + singleton=True, ) # Apply each variant count aggregation in `agg` to get counts for all # combinations of `grouping`. From 34653f0526b5ce5a7eda691f0c33107347ec6c1a Mon Sep 17 00:00:00 2001 From: klaricch Date: Fri, 5 Apr 2024 16:16:52 -0400 Subject: [PATCH 15/16] fix downsample param --- gnomad/utils/constraint.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 4287e0b29..43854b9c5 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -7,7 +7,6 @@ import hail as hl from hail.utils.misc import divide_null, new_temp_file -from gnomad.resources.grch38.gnomad import DOWNSAMPLINGS from gnomad.utils.vep import explode_by_vep_annotation, process_consequences logging.basicConfig( @@ -56,6 +55,7 @@ def count_variants_by_group( freq_meta_expr: Optional[hl.expr.ArrayExpression] = None, count_singletons: bool = False, count_downsamplings: Tuple[str] = (), + downsamplings: Optional[List[int]] = None, additional_grouping: Tuple[str] = (), partition_hint: int = 100, omit_methylation: bool = False, @@ -130,6 +130,8 @@ def count_variants_by_group( Default is False. :param count_downsamplings: Tuple of populations to use for downsampling counts. Default is (). + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling counts. :param additional_grouping: Additional features to group by. e.g. 'exome_coverage'. Default is (). :param partition_hint: Target number of partitions for aggregation. Default is 100. @@ -209,7 +211,7 @@ def count_variants_by_group( freq_meta_expr, pop, max_af=max_af, - downsamplings=DOWNSAMPLINGS["v4"], + downsamplings=downsamplings, ) if count_singletons: logger.info( @@ -223,7 +225,7 @@ def count_variants_by_group( freq_meta_expr, pop, max_af=max_af, - downsamplings=DOWNSAMPLINGS["v4"], + downsamplings=downsamplings, singleton=True, ) # Apply each variant count aggregation in `agg` to get counts for all From 1adda99b7f68bf7d1f445f01523527b4cc549108 Mon Sep 17 00:00:00 2001 From: klaricch Date: Fri, 5 Apr 2024 16:46:51 -0400 Subject: [PATCH 16/16] revert changes --- gnomad/utils/constraint.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 43854b9c5..77039540e 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1012,15 +1012,6 @@ def compute_expected_variants( intercept = plateau_model[0] agg_func = hl.agg.sum ann_to_sum = ["observed_variants", "possible_variants"] - # If genetic ancestry group is "global", use the overall plateau model, - # but sum across array of downsampling counts. - elif pop == "global": - plateau_model = hl.literal(plateau_models_expr.total)[cpg_expr] - slope = plateau_model[1] - intercept = plateau_model[0] - agg_func = hl.agg.array_sum - pop = f"_{pop}" - ann_to_sum = [f"downsampling_counts{pop}"] else: plateau_model = hl.literal(plateau_models_expr[pop]) slope = hl.map(lambda f: f[cpg_expr][1], plateau_model)