From afa29f58fc6f75dffc52b864b808f9ca1a8de051 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 28 Mar 2023 09:32:51 -0400 Subject: [PATCH 1/4] First attempt at strata groupings --- gnomad/utils/annotations.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index cc0c62306..db84d3798 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -334,6 +334,9 @@ def annotate_freq( pop_expr: Optional[hl.expr.StringExpression] = None, subpop_expr: Optional[hl.expr.StringExpression] = None, additional_strata_expr: Optional[Dict[str, hl.expr.StringExpression]] = None, + additional_strata_grouping_expr: Optional[ + Dict[str, hl.expr.StringExpression] + ] = None, downsamplings: Optional[List[int]] = None, ) -> hl.MatrixTable: """ @@ -410,7 +413,10 @@ def annotate_freq( # Annotate cols with provided cuts mt = mt.annotate_cols(_freq_meta=_freq_meta_expr) - # Get counters for sex, pop and subpop if set + if additional_strata_grouping_expr is None: + additional_strata_grouping_expr = {} + + # Get counters for sex, pop and if set subpop and additional strata cut_dict = { cut: hl.agg.filter( hl.is_defined(mt._freq_meta[cut]), hl.agg.counter(mt._freq_meta[cut]) @@ -509,6 +515,28 @@ def annotate_freq( + sample_group_filters ) + # Add additional groupings to strata, e.g. strata-pop, strata-sex + if additional_strata_grouping_expr is not None: + for strata in additional_strata_grouping_expr: + if strata not in cut_data.keys(): + raise KeyError( + "%s is not an existing annotation and thus cannot be combined with" + " additional strata" + ) + sample_group_filters.extend( + [ + ( + {strata: str(s_value), add_strata: str(as_value)}, + (mt._freq_meta[strata] == s_value) + & (mt._freq_meta[add_strata] == as_value), + ) + for strata in additional_strata_expr + for s_value in cut_data.get(strata, {}) + for add_strata in additional_strata_grouping_expr + for as_value in cut_data.get(add_strata, {}) + ] + ) + freq_sample_count = mt.aggregate_cols( [hl.agg.count_where(x[1]) for x in sample_group_filters] ) From 721eed5b1c1c504647365d0625683edd39b34e01 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 28 Mar 2023 10:01:56 -0400 Subject: [PATCH 2/4] Add additional groupings to cut_dict --- gnomad/utils/annotations.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index db84d3798..e6ec80f55 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -403,6 +403,8 @@ def annotate_freq( additional_strata_expr = {} _freq_meta_expr = hl.struct(**additional_strata_expr) + if additional_strata_grouping_expr is not None: + _freq_meta_expr = _freq_meta_expr.annotate(**additional_strata_grouping_expr) if sex_expr is not None: _freq_meta_expr = _freq_meta_expr.annotate(sex=sex_expr) if pop_expr is not None: From a6be26773e9f7285dd0744c963ade9066b4baed4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 28 Mar 2023 10:16:28 -0400 Subject: [PATCH 3/4] Add additional group to docstring --- gnomad/utils/annotations.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index e6ec80f55..aaf1bcee6 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -391,6 +391,7 @@ def annotate_freq( :param pop_expr: When specified, frequencies are stratified by population. If `sex_expr` is also specified, then a pop/sex stratifiction is added. :param subpop_expr: When specified, frequencies are stratified by sub-continental population. Note that `pop_expr` is required as well when using this option. :param additional_strata_expr: When specified, frequencies are stratified by the given additional strata found in the dict. This can e.g. be used to stratify by platform. + :param additional_strata_grouping_expr: When specified, frequencies are further stratified by groups within the additional_strata_expr. This can e.g. be used to stratify by platform-population. :param downsamplings: When specified, frequencies are computed by downsampling the data to the number of samples given in the list. Note that if `pop_expr` is specified, downsamplings by population is also computed. :return: MatrixTable with `freq` annotation """ @@ -399,6 +400,12 @@ def annotate_freq( "annotate_freq requires pop_expr when using subpop_expr" ) + if additional_strata_grouping_expr is not None and additional_strata_expr is None: + raise NotImplementedError( + "annotate_freq requires additional_strata_expr when using" + " additional_strata_grouping_expr" + ) + if additional_strata_expr is None: additional_strata_expr = {} @@ -517,7 +524,7 @@ def annotate_freq( + sample_group_filters ) - # Add additional groupings to strata, e.g. strata-pop, strata-sex + # Add additional groupings to strata, e.g. strata-pop, strata-sex, strata-pop-sex if additional_strata_grouping_expr is not None: for strata in additional_strata_grouping_expr: if strata not in cut_data.keys(): From 16e1ae27c931f57107642a9fedb583bce16e87ad Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 5 Apr 2023 15:35:26 -0400 Subject: [PATCH 4/4] Remove unnecessary cut_dict check --- gnomad/utils/annotations.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index aaf1bcee6..416017fc0 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -410,7 +410,9 @@ def annotate_freq( additional_strata_expr = {} _freq_meta_expr = hl.struct(**additional_strata_expr) - if additional_strata_grouping_expr is not None: + if additional_strata_grouping_expr is None: + additional_strata_grouping_expr = {} + else: _freq_meta_expr = _freq_meta_expr.annotate(**additional_strata_grouping_expr) if sex_expr is not None: _freq_meta_expr = _freq_meta_expr.annotate(sex=sex_expr) @@ -422,9 +424,6 @@ def annotate_freq( # Annotate cols with provided cuts mt = mt.annotate_cols(_freq_meta=_freq_meta_expr) - if additional_strata_grouping_expr is None: - additional_strata_grouping_expr = {} - # Get counters for sex, pop and if set subpop and additional strata cut_dict = { cut: hl.agg.filter( @@ -526,12 +525,6 @@ def annotate_freq( # Add additional groupings to strata, e.g. strata-pop, strata-sex, strata-pop-sex if additional_strata_grouping_expr is not None: - for strata in additional_strata_grouping_expr: - if strata not in cut_data.keys(): - raise KeyError( - "%s is not an existing annotation and thus cannot be combined with" - " additional strata" - ) sample_group_filters.extend( [ (