From 36369be059e13997bf25ef4154dbe512d5042078 Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 27 Oct 2022 12:08:56 -0400 Subject: [PATCH 01/23] draft functions --- gnomad/utils/constraint.py | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 99f7f1234..059b58459 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -360,3 +360,52 @@ def collapse_strand( if isinstance(t, hl.Table) else t.annotate_rows(**collapse_expr) ) + +def annotate_constraint_groupings(ht: Union[hl.Table, hl.MatrixTable], + custom_model: str = None) -> Tuple[Union[hl.Table, hl.MatrixTable], List[str]]: + """ + Add constraint annotations to be used for groupings. + + Function adds the following annotations: + - annotation - could be 'most_severe_consequence' of either 'worst_csq_by_gene' or 'transcript_consequences', or 'csq' of 'tx_annotation' + - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation + - gene - gene_symbol + - coverage - exome coverage + - transcript (added when custom model isn't specified as worst_csq) + - canonical (added when custom model isn't specified as worst_csq) + + ..note:: + HT must be exploded against whatever axis. + + :param ht: Input Table or MatrixTable. + :param custom_model: The customized model (one of "standard" or "worst_csq" for now), defaults to None. + :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. + """ + if custom_model == 'worst_csq': + groupings = { + 'annotation': ht.worst_csq_by_gene.most_severe_consequence, + 'modifier': hl.case() + .when(hl.is_defined(ht.worst_csq_by_gene.lof), + ht.worst_csq_by_gene.lof) + .when(hl.is_defined(ht.worst_csq_by_gene.polyphen_prediction), + ht.worst_csq_by_gene.polyphen_prediction) + .default('None'), + 'gene': ht.worst_csq_by_gene.gene_symbol, + 'coverage': ht.exome_coverage + } + else: + groupings = { + 'annotation': ht.transcript_consequences.most_severe_consequence, + 'modifier': hl.case() + .when(hl.is_defined(ht.transcript_consequences.lof), + ht.transcript_consequences.lof) + .when(hl.is_defined(ht.transcript_consequences.polyphen_prediction), + ht.transcript_consequences.polyphen_prediction) + .default('None'), + 'transcript': ht.transcript_consequences.transcript_id, + 'gene': ht.transcript_consequences.gene_symbol, + 'canonical': hl.or_else(ht.transcript_consequences.canonical == 1, False), + 'coverage': ht.exome_coverage + } + ht = ht.annotate(**groupings) + return ht, tuple(groupings.keys()) \ No newline at end of file From 6a30c2056564fc24fe70763dce44f10998b051b7 Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 27 Oct 2022 12:45:46 -0400 Subject: [PATCH 02/23] add annotate_constraint_groupings() --- gnomad/utils/constraint.py | 82 +++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 059b58459..d50a75443 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -361,51 +361,67 @@ def collapse_strand( else t.annotate_rows(**collapse_expr) ) -def annotate_constraint_groupings(ht: Union[hl.Table, hl.MatrixTable], - custom_model: str = None) -> Tuple[Union[hl.Table, hl.MatrixTable], List[str]]: + +def annotate_constraint_groupings( + ht: Union[hl.Table, hl.MatrixTable], custom_model: str = None +) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: """ Add constraint annotations to be used for groupings. - + Function adds the following annotations: - - annotation - could be 'most_severe_consequence' of either 'worst_csq_by_gene' or 'transcript_consequences', or 'csq' of 'tx_annotation' + - annotation -'most_severe_consequence' of either 'worst_csq_by_gene' + or 'transcript_consequences' - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation - - gene - gene_symbol + of either 'worst_csq_by_gene' or 'transcript_consequences' + - gene - 'gene_symbol' of either 'worst_csq_by_gene' or + 'transcript_consequences' - coverage - exome coverage - - transcript (added when custom model isn't specified as worst_csq) - - canonical (added when custom model isn't specified as worst_csq) - + - transcript (added when custom model is specified as "standard") + - canonical (added when custom model is specified as "standard") + ..note:: HT must be exploded against whatever axis. :param ht: Input Table or MatrixTable. - :param custom_model: The customized model (one of "standard" or "worst_csq" for now), defaults to None. - :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. + :param custom_model: The customized model (one of "standard" or "worst_csq"). + Default is None. + :return: A tuple of input Table or MatrixTable with grouping annotations added and + the names of added annotations. """ - if custom_model == 'worst_csq': + if custom_model == "worst_csq": groupings = { - 'annotation': ht.worst_csq_by_gene.most_severe_consequence, - 'modifier': hl.case() - .when(hl.is_defined(ht.worst_csq_by_gene.lof), - ht.worst_csq_by_gene.lof) - .when(hl.is_defined(ht.worst_csq_by_gene.polyphen_prediction), - ht.worst_csq_by_gene.polyphen_prediction) - .default('None'), - 'gene': ht.worst_csq_by_gene.gene_symbol, - 'coverage': ht.exome_coverage + "annotation": ht.worst_csq_by_gene.most_severe_consequence, + "modifier": hl.case() + .when(hl.is_defined(ht.worst_csq_by_gene.lof), ht.worst_csq_by_gene.lof) + .when( + hl.is_defined(ht.worst_csq_by_gene.polyphen_prediction), + ht.worst_csq_by_gene.polyphen_prediction, + ) + .default("None"), + "gene": ht.worst_csq_by_gene.gene_symbol, + "coverage": ht.exome_coverage, } else: groupings = { - 'annotation': ht.transcript_consequences.most_severe_consequence, - 'modifier': hl.case() - .when(hl.is_defined(ht.transcript_consequences.lof), - ht.transcript_consequences.lof) - .when(hl.is_defined(ht.transcript_consequences.polyphen_prediction), - ht.transcript_consequences.polyphen_prediction) - .default('None'), - 'transcript': ht.transcript_consequences.transcript_id, - 'gene': ht.transcript_consequences.gene_symbol, - 'canonical': hl.or_else(ht.transcript_consequences.canonical == 1, False), - 'coverage': ht.exome_coverage + "annotation": ht.transcript_consequences.most_severe_consequence, + "modifier": hl.case() + .when( + hl.is_defined(ht.transcript_consequences.lof), + ht.transcript_consequences.lof, + ) + .when( + hl.is_defined(ht.transcript_consequences.polyphen_prediction), + ht.transcript_consequences.polyphen_prediction, + ) + .default("None"), + "transcript": ht.transcript_consequences.transcript_id, + "gene": ht.transcript_consequences.gene_symbol, + "canonical": hl.or_else(ht.transcript_consequences.canonical == 1, False), + "coverage": ht.exome_coverage, } - ht = ht.annotate(**groupings) - return ht, tuple(groupings.keys()) \ No newline at end of file + ht = ( + ht.annotate(**groupings) + if isinstance(ht, hl.Table) + else ht.annotate_rows(**groupings) + ) + return ht, tuple(groupings.keys()) From fea8b944e37cd5c86769175caedec2abf1523ce8 Mon Sep 17 00:00:00 2001 From: averywpx Date: Fri, 28 Oct 2022 11:12:46 -0400 Subject: [PATCH 03/23] change params --- gnomad/utils/constraint.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index d50a75443..e1465a8ca 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -363,7 +363,14 @@ def collapse_strand( def annotate_constraint_groupings( - ht: Union[hl.Table, hl.MatrixTable], custom_model: str = None + most_severe_consequence_expr, + lof_expr, + polyphen_prediction_expr, + gene_symbol_expr, + exome_coverage_expr, + transcript_id_expr, + canonical_expr, + custom_model: str = None ) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: """ Add constraint annotations to be used for groupings. @@ -389,6 +396,14 @@ def annotate_constraint_groupings( the names of added annotations. """ if custom_model == "worst_csq": + context_ht = process_consequences(context_ht) + context_ht = context_ht.transmute( + worst_csq_by_gene=context_ht.vep.worst_csq_by_gene + ) + context_ht = context_ht.explode(context_ht.worst_csq_by_gene) + exome_ht = process_consequences(exome_ht) + exome_ht = exome_ht.transmute(worst_csq_by_gene=exome_ht.vep.worst_csq_by_gene) + exome_ht = exome_ht.explode(exome_ht.worst_csq_by_gene) groupings = { "annotation": ht.worst_csq_by_gene.most_severe_consequence, "modifier": hl.case() @@ -402,6 +417,15 @@ def annotate_constraint_groupings( "coverage": ht.exome_coverage, } else: + context_ht = context_ht.transmute( + transcript_consequences=context_ht.vep.transcript_consequences + ) + context_ht = context_ht.explode(context_ht.transcript_consequences) + exome_ht = exome_ht.transmute( + transcript_consequences=exome_ht.vep.transcript_consequences + ) + exome_ht = exome_ht.explode(exome_ht.transcript_consequences) + groupings = { "annotation": ht.transcript_consequences.most_severe_consequence, "modifier": hl.case() From 6fdee27a7d3ae16c162efb617f562ad0449feb76 Mon Sep 17 00:00:00 2001 From: averywpx Date: Fri, 28 Oct 2022 16:39:43 -0400 Subject: [PATCH 04/23] reformat to adapt function in gnomad-constraint --- gnomad/utils/constraint.py | 133 ++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 74 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index e1465a8ca..86c55b605 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1,6 +1,7 @@ """Script containing generic constraint functions that may be used in the constraint pipeline.""" import logging +from re import T from typing import Any, Optional, Tuple, Union import hail as hl @@ -12,6 +13,8 @@ logger = logging.getLogger("constraint_utils") logger.setLevel(logging.INFO) +from gnomad.utils.vep import process_consequences + def annotate_with_mu( ht: hl.Table, @@ -363,89 +366,71 @@ def collapse_strand( def annotate_constraint_groupings( - most_severe_consequence_expr, - lof_expr, - polyphen_prediction_expr, - gene_symbol_expr, - exome_coverage_expr, - transcript_id_expr, - canonical_expr, - custom_model: str = None + t: Union[hl.Table, hl.MatrixTable], + vep_annotation: str, + vep_root: str = "vep", ) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: """ Add constraint annotations to be used for groupings. Function adds the following annotations: - - annotation -'most_severe_consequence' of either 'worst_csq_by_gene' - or 'transcript_consequences' + - annotation -'most_severe_consequence' annotation in `vep_annotation` - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation - of either 'worst_csq_by_gene' or 'transcript_consequences' - - gene - 'gene_symbol' of either 'worst_csq_by_gene' or - 'transcript_consequences' - - coverage - exome coverage - - transcript (added when custom model is specified as "standard") - - canonical (added when custom model is specified as "standard") - - ..note:: - HT must be exploded against whatever axis. - - :param ht: Input Table or MatrixTable. - :param custom_model: The customized model (one of "standard" or "worst_csq"). - Default is None. + in `vep_annotation` + - gene - 'gene_symbol' annotation inside `vep_annotation` + - coverage - exome coverage in `t` + - transcript (added when `vep_annotation` is specified as "transcript_consequences") + - canonical (added when `vep_annotation` is specified as "transcript_consequences") + + :param t: Input Table or MatrixTable. + :param vep_annotation: Name of annotation in VEP annotation that will be used for + constraint annotation. + :param vep_root: Name used for VEP annotation. Default is 'vep'. :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ - if custom_model == "worst_csq": - context_ht = process_consequences(context_ht) - context_ht = context_ht.transmute( - worst_csq_by_gene=context_ht.vep.worst_csq_by_gene - ) - context_ht = context_ht.explode(context_ht.worst_csq_by_gene) - exome_ht = process_consequences(exome_ht) - exome_ht = exome_ht.transmute(worst_csq_by_gene=exome_ht.vep.worst_csq_by_gene) - exome_ht = exome_ht.explode(exome_ht.worst_csq_by_gene) - groupings = { - "annotation": ht.worst_csq_by_gene.most_severe_consequence, - "modifier": hl.case() - .when(hl.is_defined(ht.worst_csq_by_gene.lof), ht.worst_csq_by_gene.lof) - .when( - hl.is_defined(ht.worst_csq_by_gene.polyphen_prediction), - ht.worst_csq_by_gene.polyphen_prediction, - ) - .default("None"), - "gene": ht.worst_csq_by_gene.gene_symbol, - "coverage": ht.exome_coverage, - } - else: - context_ht = context_ht.transmute( - transcript_consequences=context_ht.vep.transcript_consequences + if vep_annotation not in t[vep_root].keys(): + raise ValueError(f'{vep_annotation} is not a row field of the VEP annotation in Table') + + # Annotate 'worst_csq_by_gene' to t if t is used to build the "worst_cq" model. + if vep_annotation == "worst_csq_by_gene": + t = process_consequences(t) + # Create top-level annotation for `vep_annotation` by pulling out from `vep_root` + # annotation. + t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) + # Explode the `vep_annotation`. + t = t.explode(t[vep_annotation]) + + vep_annotation_expr = t[vep_annotation] + lof_expr = vep_annotation_expr.lof + polyphen_prediction_expr = vep_annotation_expr.polyphen_prediction + + # Create constraint annotations to be used for groupings. + groupings = { + "annotation": vep_annotation_expr.most_severe_consequence, + "modifier": hl.case() + .when( + hl.is_defined(lof_expr), + lof_expr, ) - context_ht = context_ht.explode(context_ht.transcript_consequences) - exome_ht = exome_ht.transmute( - transcript_consequences=exome_ht.vep.transcript_consequences + .when( + hl.is_defined(polyphen_prediction_expr), + polyphen_prediction_expr, ) - exome_ht = exome_ht.explode(exome_ht.transcript_consequences) - - groupings = { - "annotation": ht.transcript_consequences.most_severe_consequence, - "modifier": hl.case() - .when( - hl.is_defined(ht.transcript_consequences.lof), - ht.transcript_consequences.lof, - ) - .when( - hl.is_defined(ht.transcript_consequences.polyphen_prediction), - ht.transcript_consequences.polyphen_prediction, - ) - .default("None"), - "transcript": ht.transcript_consequences.transcript_id, - "gene": ht.transcript_consequences.gene_symbol, - "canonical": hl.or_else(ht.transcript_consequences.canonical == 1, False), - "coverage": ht.exome_coverage, - } - ht = ( - ht.annotate(**groupings) - if isinstance(ht, hl.Table) - else ht.annotate_rows(**groupings) + .default("None"), + "gene": vep_annotation_expr.gene_symbol, + "coverage": t.exome_coverage, + } + + # Add 'transcript' and 'canonical' annotation if grouping is used for the + # "standard" model. + if vep_annotation == "transcript_consequences": + groupings["transcript"] = vep_annotation_expr.transcript_id + groupings["canonical"] = hl.or_else(vep_annotation_expr.canonical == 1, False) + + t = ( + t.annotate(**groupings) + if isinstance(t, hl.Table) + else t.annotate_rows(**groupings) ) - return ht, tuple(groupings.keys()) + return t, tuple(groupings.keys()) From d9b2860c7907f9c7fd113d8beb832553ca21f635 Mon Sep 17 00:00:00 2001 From: averywpx Date: Fri, 28 Oct 2022 17:44:42 -0400 Subject: [PATCH 05/23] fix docstring indent --- gnomad/utils/constraint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 86c55b605..e32ad845c 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -384,10 +384,10 @@ def annotate_constraint_groupings( :param t: Input Table or MatrixTable. :param vep_annotation: Name of annotation in VEP annotation that will be used for - constraint annotation. + constraint annotation. :param vep_root: Name used for VEP annotation. Default is 'vep'. :return: A tuple of input Table or MatrixTable with grouping annotations added and - the names of added annotations. + the names of added annotations. """ if vep_annotation not in t[vep_root].keys(): raise ValueError(f'{vep_annotation} is not a row field of the VEP annotation in Table') From 32e9227a1fa1269abd5eb2fa402a56036fc0badd Mon Sep 17 00:00:00 2001 From: averywpx Date: Mon, 31 Oct 2022 14:41:04 -0400 Subject: [PATCH 06/23] black reformat --- gnomad/utils/constraint.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index e32ad845c..c9d67f583 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -383,14 +383,16 @@ def annotate_constraint_groupings( - canonical (added when `vep_annotation` is specified as "transcript_consequences") :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in VEP annotation that will be used for - constraint annotation. + :param vep_annotation: Name of annotation in VEP annotation that will be used for + constraint annotation. :param vep_root: Name used for VEP annotation. Default is 'vep'. :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ if vep_annotation not in t[vep_root].keys(): - raise ValueError(f'{vep_annotation} is not a row field of the VEP annotation in Table') + raise ValueError( + f"{vep_annotation} is not a row field of the VEP annotation in Table" + ) # Annotate 'worst_csq_by_gene' to t if t is used to build the "worst_cq" model. if vep_annotation == "worst_csq_by_gene": From c75b9814ad9e7a1037994f3029891b536f62f0b5 Mon Sep 17 00:00:00 2001 From: averywpx Date: Tue, 1 Nov 2022 15:09:35 -0400 Subject: [PATCH 07/23] delete unused import --- gnomad/utils/constraint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index c9d67f583..150577f61 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1,7 +1,6 @@ """Script containing generic constraint functions that may be used in the constraint pipeline.""" import logging -from re import T from typing import Any, Optional, Tuple, Union import hail as hl From e57aa783e80611a73f28168cf69f08e0e072d5d4 Mon Sep 17 00:00:00 2001 From: averywpx Date: Tue, 1 Nov 2022 20:50:12 -0400 Subject: [PATCH 08/23] switch to list --- gnomad/utils/constraint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 150577f61..4feaf4213 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -434,4 +434,4 @@ def annotate_constraint_groupings( if isinstance(t, hl.Table) else t.annotate_rows(**groupings) ) - return t, tuple(groupings.keys()) + return t, list(groupings.keys()) From 5750d1c91b75996ebae7e0a9191614d9c60dc1b9 Mon Sep 17 00:00:00 2001 From: averywpx Date: Wed, 2 Nov 2022 09:46:25 -0400 Subject: [PATCH 09/23] fix small bug --- gnomad/utils/constraint.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 4feaf4213..68fd5b97e 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -4,6 +4,7 @@ from typing import Any, Optional, Tuple, Union import hail as hl +from gnomad.utils.vep import process_consequences logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -12,8 +13,6 @@ logger = logging.getLogger("constraint_utils") logger.setLevel(logging.INFO) -from gnomad.utils.vep import process_consequences - def annotate_with_mu( ht: hl.Table, @@ -388,14 +387,14 @@ def annotate_constraint_groupings( :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ + # Annotate 'worst_csq_by_gene' to t if t is used to build the "worst_cq" model. + if vep_annotation == "worst_csq_by_gene": + t = process_consequences(t) + if vep_annotation not in t[vep_root].keys(): raise ValueError( f"{vep_annotation} is not a row field of the VEP annotation in Table" ) - - # Annotate 'worst_csq_by_gene' to t if t is used to build the "worst_cq" model. - if vep_annotation == "worst_csq_by_gene": - t = process_consequences(t) # Create top-level annotation for `vep_annotation` by pulling out from `vep_root` # annotation. t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) @@ -434,4 +433,4 @@ def annotate_constraint_groupings( if isinstance(t, hl.Table) else t.annotate_rows(**groupings) ) - return t, list(groupings.keys()) + return t, tuple(groupings.keys()) From 7f6fef102d2fdb9399f272025e487f5a73c60880 Mon Sep 17 00:00:00 2001 From: averywpx Date: Wed, 2 Nov 2022 09:53:14 -0400 Subject: [PATCH 10/23] apply isort --- gnomad/utils/constraint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 68fd5b97e..d9eaf2407 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -4,6 +4,7 @@ from typing import Any, Optional, Tuple, Union import hail as hl + from gnomad.utils.vep import process_consequences logging.basicConfig( From 53b3b6cf255342637b12740b6ca9162f2a2484a4 Mon Sep 17 00:00:00 2001 From: averywpx Date: Wed, 9 Nov 2022 20:56:09 -0500 Subject: [PATCH 11/23] fix change requests --- gnomad/utils/constraint.py | 134 +++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 43 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index d9eaf2407..5434b058a 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1,7 +1,7 @@ """Script containing generic constraint functions that may be used in the constraint pipeline.""" - +# cSpell: disable import logging -from typing import Any, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import hail as hl @@ -364,31 +364,21 @@ def collapse_strand( ) -def annotate_constraint_groupings( +def explode_by_vep_annotation( t: Union[hl.Table, hl.MatrixTable], vep_annotation: str, vep_root: str = "vep", -) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: +) -> Union[hl.Table, hl.MatrixTable]: """ - Add constraint annotations to be used for groupings. - - Function adds the following annotations: - - annotation -'most_severe_consequence' annotation in `vep_annotation` - - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation - in `vep_annotation` - - gene - 'gene_symbol' annotation inside `vep_annotation` - - coverage - exome coverage in `t` - - transcript (added when `vep_annotation` is specified as "transcript_consequences") - - canonical (added when `vep_annotation` is specified as "transcript_consequences") + Annotate specified VEP annotation if it's not in the input Table or MatrixTable and explode the VEP annotation. :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in VEP annotation that will be used for - constraint annotation. + :param vep_annotation: Name of annotation in VEP annotation that will be annotated, + if necessary, and explode. :param vep_root: Name used for VEP annotation. Default is 'vep'. - :return: A tuple of input Table or MatrixTable with grouping annotations added and - the names of added annotations. + :return: Table or MatrixTable with exploded VEP annotation. """ - # Annotate 'worst_csq_by_gene' to t if t is used to build the "worst_cq" model. + # Annotate 'worst_csq_by_gene' to t if it's specified for `vep_annotation`. if vep_annotation == "worst_csq_by_gene": t = process_consequences(t) @@ -397,41 +387,99 @@ def annotate_constraint_groupings( f"{vep_annotation} is not a row field of the VEP annotation in Table" ) # Create top-level annotation for `vep_annotation` by pulling out from `vep_root` - # annotation. - t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) - # Explode the `vep_annotation`. - t = t.explode(t[vep_annotation]) + # annotation and explode the `vep_annotation`. + if isinstance(t, hl.Table): + t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode(t[vep_annotation]) + else: + t = t.transmute_rows(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode_rows(t[vep_annotation]) + + return t + + +def get_constraint_grouping_expr( + vep_annotation_expr: hl.StructExpression, + coverage_expr: hl.Int32Expression = None, + include_transcript_group: bool = True, + include_canonical_group: bool = True, +) -> Dict[str, Union[hl.StringExpression, hl.nt32Expression, hl.BooleanExpression]]: + """ + Collect annotations used for constraint groupings. - vep_annotation_expr = t[vep_annotation] + Function collects the following annotations: + - annotation - 'most_severe_consequence' annotation in `vep_annotation` + - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation + in `vep_annotation` + - gene - 'gene_symbol' annotation inside `vep_annotation` + - coverage - exome coverage + - transcript (added when `include_transcript_group` is True) + - canonical (added when `include_canonical_group` is True) + + :param vep_annotation_expr: StructExpression of VEP annotation. + :param coverage_expr: Int32Expression of exome coverage. Default is None. + :param include_transcript_group: Wether to include the transcript annotation in the + groupings. Default is True. + :param include_canonical_group: Wether to include canonical annotation in the + groupings. Default is True. + :return: A dictionary with keys as annotation names and values as actual + annotations. + """ lof_expr = vep_annotation_expr.lof polyphen_prediction_expr = vep_annotation_expr.polyphen_prediction # Create constraint annotations to be used for groupings. groupings = { "annotation": vep_annotation_expr.most_severe_consequence, - "modifier": hl.case() - .when( - hl.is_defined(lof_expr), - lof_expr, - ) - .when( - hl.is_defined(polyphen_prediction_expr), - polyphen_prediction_expr, - ) - .default("None"), + "modifier": hl.coalesce(lof_expr, polyphen_prediction_expr, "None"), "gene": vep_annotation_expr.gene_symbol, - "coverage": t.exome_coverage, } + if coverage_expr is not None: + groupings["coverage"] = coverage_expr - # Add 'transcript' and 'canonical' annotation if grouping is used for the - # "standard" model. - if vep_annotation == "transcript_consequences": + # Add 'transcript' and 'canonical' annotation if requested + if include_transcript_group: groupings["transcript"] = vep_annotation_expr.transcript_id + if include_canonical_group: groupings["canonical"] = hl.or_else(vep_annotation_expr.canonical == 1, False) - t = ( - t.annotate(**groupings) - if isinstance(t, hl.Table) - else t.annotate_rows(**groupings) + return groupings + + +def annotate_constraint_groupings( + ht: hl.Table, vep_annotation: str = "transcript_consequences" +) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: + """ + Annotate constraint annotations used for groupings. + + Function explods the specified VEP annotation and annotates the following + annotations: + - annotation -'most_severe_consequence' annotation in `vep_annotation` + - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation + in `vep_annotation` + - gene - 'gene_symbol' annotation inside `vep_annotation` + - coverage - exome coverage in `t` + - transcript (added when `vep_annotation` is specified as + "transcript_consequences") + - canonical (added when `vep_annotation` is specified as + "transcript_consequences") + + :param t: Input Table or MatrixTable. + :param vep_annotation: Name of annotation in VEP annotation that will be used for + constraint annotation. + :return: A tuple of input Table or MatrixTable with grouping annotations added and + the names of added annotations. + """ + ht = explode_by_vep_annotation(ht, vep_annotation) + if vep_annotation == "transcript_consequences": + include_transcript_group = include_canonical_group = True + else: + include_transcript_group = include_canonical_group = False + groupings = get_constraint_grouping_expr( + ht[vep_annotation], + coverage_expr=ht.exome_coverage, + include_transcript_group=include_transcript_group, + include_canonical_group=include_canonical_group, ) - return t, tuple(groupings.keys()) + + return ht.annotate(**groupings), tuple(groupings.keys()) From 5e521bcbbe45547b66704b13459734a6f6af62de Mon Sep 17 00:00:00 2001 From: averywpx Date: Wed, 9 Nov 2022 21:19:30 -0500 Subject: [PATCH 12/23] fix docstrings --- gnomad/utils/constraint.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 5434b058a..b1f448dd5 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1,5 +1,5 @@ """Script containing generic constraint functions that may be used in the constraint pipeline.""" -# cSpell: disable + import logging from typing import Any, Dict, Optional, Tuple, Union @@ -384,10 +384,10 @@ def explode_by_vep_annotation( if vep_annotation not in t[vep_root].keys(): raise ValueError( - f"{vep_annotation} is not a row field of the VEP annotation in Table" + f"{vep_annotation} is not a row field of the VEP annotation in" + " Table/MatrixTable!" ) - # Create top-level annotation for `vep_annotation` by pulling out from `vep_root` - # annotation and explode the `vep_annotation`. + # Create top-level annotation for `vep_annotation` and explode it. if isinstance(t, hl.Table): t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) t = t.explode(t[vep_annotation]) @@ -416,6 +416,16 @@ def get_constraint_grouping_expr( - transcript (added when `include_transcript_group` is True) - canonical (added when `include_canonical_group` is True) + .. note:: + This function expects that the following fields are present in + `vep_annotation_expr`: + - lof + - polyphen_prediction + - most_severe_consequence + - gene_symbol + - transcript_id (if `include_transcript_group` is True) + - canonical (if `include_canonical_group` is True) + :param vep_annotation_expr: StructExpression of VEP annotation. :param coverage_expr: Int32Expression of exome coverage. Default is None. :param include_transcript_group: Wether to include the transcript annotation in the @@ -437,7 +447,7 @@ def get_constraint_grouping_expr( if coverage_expr is not None: groupings["coverage"] = coverage_expr - # Add 'transcript' and 'canonical' annotation if requested + # Add 'transcript' and 'canonical' annotation if requested. if include_transcript_group: groupings["transcript"] = vep_annotation_expr.transcript_id if include_canonical_group: @@ -452,7 +462,7 @@ def annotate_constraint_groupings( """ Annotate constraint annotations used for groupings. - Function explods the specified VEP annotation and annotates the following + Function explodes the specified VEP annotation and annotates the following annotations: - annotation -'most_severe_consequence' annotation in `vep_annotation` - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation @@ -464,17 +474,26 @@ def annotate_constraint_groupings( - canonical (added when `vep_annotation` is specified as "transcript_consequences") + .. note:: + This function expects that the following fields are present in `ht`: + - vep + - exome_coverage + :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in VEP annotation that will be used for - constraint annotation. + :param vep_annotation: Name of annotation in VEP annotation (one of + "transcript_consequences" and "worst_csq_by_gene") that will be used for + constraint annotation. Defaults to "transcript_consequences". :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ + # Explode the specified VEP annotation. ht = explode_by_vep_annotation(ht, vep_annotation) if vep_annotation == "transcript_consequences": include_transcript_group = include_canonical_group = True else: include_transcript_group = include_canonical_group = False + + # Collect the annotations used for groupings. groupings = get_constraint_grouping_expr( ht[vep_annotation], coverage_expr=ht.exome_coverage, From cc18a32aad9a3369380a3302d5870046d8654074 Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 10 Nov 2022 12:25:31 -0500 Subject: [PATCH 13/23] fix small issues --- gnomad/resources/resource_utils.py | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 9b3a311e4..5ee9e7074 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -302,6 +302,45 @@ def import_resource(self, overwrite: bool = True, **kwargs) -> None: self.import_func(**self.import_args).write( self.path, overwrite=overwrite, **kwargs ) + + +class ExpressionResource(BaseResource): + """ + A Hail Expression resource. + + :param path: The Expression path (typically ending in .he). + :param import_args: Any sources that are required for the import and need to be + kept track of and/or passed to the import_func (e.g. .vcf path for an imported + VCF). + :param import_func: A function used to import the Expression. `import_func` will be + passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".he"] + + def he(self, force_import: bool = False) -> hl.expr.Expression: + """ + Read and return the Hail Expression resource. + + :return: Hail Expression resource. + """ + if self.path is None or force_import: + return self.import_func(**self.import_args) + else: + return hl.experimental.read_expression(self.path) + + def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the Expression resource using its import_func and writes it in its path. + + :param overwrite: If set, existing file(s) will be overwritten. + :param kwargs: Any other parameters to be passed to hl.experimental. + write_expression. + :return: Nothing. + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + ) class BaseVersionedResource: From 5594d691ca944a6537279d7236ed85c462ee12bf Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 10 Nov 2022 12:29:22 -0500 Subject: [PATCH 14/23] add func --- gnomad/utils/constraint.py | 63 +++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 17c1be9a6..5872b512c 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -798,7 +798,7 @@ def get_constraint_grouping_expr( return groupings -def annotate_constraint_groupings( +def annotate_exploded_vep_for_constraint_groupings( ht: hl.Table, vep_annotation: str = "transcript_consequences" ) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: """ @@ -844,3 +844,64 @@ def annotate_constraint_groupings( ) return ht.annotate(**groupings), tuple(groupings.keys()) + +def apply_plateau_models( + ht: hl.Table, + plateau_models: hl.StructExpression, + mu_expr: hl.Float64Expression, + cov_corr_expr: hl.Float64Expression, + pop: Optional[str] = None, +) -> Dict[str, Union[hl.Float64Expression, hl.Int64Expression]]: + """ + Apply plateau models for all sites and for a population (if specified) to compute predicted proportion observed ratio and expected variant counts. + + .. note: + Function expects following annotations to be present in `ht`: + - cpg + - observed_variants + - possible_variants + + :param ht: Input Table. + :param plateau_models: Linear models (output of `build_models()` in + gnomad_methods`), with the values of the dictionary formatted as a + StrucExpression of intercept and slope, that calibrates mutation rate to + proportion observed for high coverage exome. It includes models for CpG s, + non-CpG sites, and each population in `POPS`. + :param mu_expr: Float64Expression of mutation rate. + :param cov_corr_expr: Float64Expression of corrected coverage expression. + :param pop: Population that will be used when applying plateau model. Default is + None. + :return: A dictionary with predicted proportion observed ratio and expected variant + counts. + """ + if pop is None: + pop = "" + plateau_model = hl.literal(plateau_models.total)[ht.cpg] + slope = plateau_model[1] + intercept = plateau_model[0] + agg_func = hl.agg.sum + ann_to_sum = ["observed_variants", "possible_variants"] + else: + plateau_model = hl.literal(plateau_models[pop]) + slope = hl.map(lambda f: f[ht.cpg][1], plateau_model) + intercept = hl.map(lambda f: f[ht.cpg][0], plateau_model) + agg_func = hl.agg.array_sum + pop = f"_{pop}" + ann_to_sum = [f"downsampling_counts{pop}"] + + # Apply plateau models for specified population. + ppo_expr = mu_expr * slope + intercept + + # Generate sum aggregators for 'predicted_proportion_observed' and + # 'expected_variants', for specified population. + agg_expr = { + f"predicted_proportion_observed{pop}": agg_func(ppo_expr), + f"expected_variants{pop}": agg_func(ppo_expr * cov_corr_expr), + } + + # Generate sum aggregators for 'observed_variants' and 'possible_variants' on + # the entire dataset if pop is None, and for `downsampling_counts` for + # specified population if pop is not None. + agg_expr.update({ann: agg_func(ht[ann]) for ann in ann_to_sum}) + + return agg_expr From dc3a5a75e9aed343cbcda053595d9d1c6236d595 Mon Sep 17 00:00:00 2001 From: averywpx Date: Wed, 16 Nov 2022 14:34:52 -0500 Subject: [PATCH 15/23] black reformat --- gnomad/resources/resource_utils.py | 8 ++++---- gnomad/utils/constraint.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 5ee9e7074..57b397325 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -302,17 +302,17 @@ def import_resource(self, overwrite: bool = True, **kwargs) -> None: self.import_func(**self.import_args).write( self.path, overwrite=overwrite, **kwargs ) - + class ExpressionResource(BaseResource): """ A Hail Expression resource. :param path: The Expression path (typically ending in .he). - :param import_args: Any sources that are required for the import and need to be - kept track of and/or passed to the import_func (e.g. .vcf path for an imported + :param import_args: Any sources that are required for the import and need to be + kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF). - :param import_func: A function used to import the Expression. `import_func` will be + :param import_func: A function used to import the Expression. `import_func` will be passed the `import_args` dictionary as kwargs. """ diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 5872b512c..343734182 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -845,6 +845,7 @@ def annotate_exploded_vep_for_constraint_groupings( return ht.annotate(**groupings), tuple(groupings.keys()) + def apply_plateau_models( ht: hl.Table, plateau_models: hl.StructExpression, From a38b4250c09eb0d88288c585a7eda0147bbefcac Mon Sep 17 00:00:00 2001 From: averywpx Date: Fri, 18 Nov 2022 09:50:22 -0500 Subject: [PATCH 16/23] small changes --- gnomad/utils/constraint.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 343734182..d23a01a28 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -846,11 +846,12 @@ def annotate_exploded_vep_for_constraint_groupings( return ht.annotate(**groupings), tuple(groupings.keys()) -def apply_plateau_models( +def compute_expected_variants( ht: hl.Table, plateau_models: hl.StructExpression, mu_expr: hl.Float64Expression, cov_corr_expr: hl.Float64Expression, + cpg_expr: hl.BooleanExpression, pop: Optional[str] = None, ) -> Dict[str, Union[hl.Float64Expression, hl.Int64Expression]]: """ @@ -870,6 +871,7 @@ def apply_plateau_models( non-CpG sites, and each population in `POPS`. :param mu_expr: Float64Expression of mutation rate. :param cov_corr_expr: Float64Expression of corrected coverage expression. + :param cpg_expr: BooleanExpression noting whether a site is a CPG site. :param pop: Population that will be used when applying plateau model. Default is None. :return: A dictionary with predicted proportion observed ratio and expected variant @@ -877,15 +879,15 @@ def apply_plateau_models( """ if pop is None: pop = "" - plateau_model = hl.literal(plateau_models.total)[ht.cpg] + plateau_model = hl.literal(plateau_models.total)[cpg_expr] slope = plateau_model[1] intercept = plateau_model[0] agg_func = hl.agg.sum ann_to_sum = ["observed_variants", "possible_variants"] else: plateau_model = hl.literal(plateau_models[pop]) - slope = hl.map(lambda f: f[ht.cpg][1], plateau_model) - intercept = hl.map(lambda f: f[ht.cpg][0], plateau_model) + slope = hl.map(lambda f: f[cpg_expr][1], plateau_model) + intercept = hl.map(lambda f: f[cpg_expr][0], plateau_model) agg_func = hl.agg.array_sum pop = f"_{pop}" ann_to_sum = [f"downsampling_counts{pop}"] From 64a58c09f29945af710fcf7d958296f9a481f997 Mon Sep 17 00:00:00 2001 From: averywpx Date: Fri, 18 Nov 2022 09:58:13 -0500 Subject: [PATCH 17/23] small changes --- gnomad/utils/constraint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index d23a01a28..67ad18530 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -745,7 +745,7 @@ def get_constraint_grouping_expr( coverage_expr: hl.Int32Expression = None, include_transcript_group: bool = True, include_canonical_group: bool = True, -) -> Dict[str, Union[hl.StringExpression, hl.nt32Expression, hl.BooleanExpression]]: +) -> Dict[str, Union[hl.StringExpression, hl.Int32Expression, hl.BooleanExpression]]: """ Collect annotations used for constraint groupings. From 6aa02c820d55a4c813785e206c290585c10f9a00 Mon Sep 17 00:00:00 2001 From: averywpx <51382435+averywpx@users.noreply.github.com> Date: Thu, 1 Dec 2022 04:18:34 -0500 Subject: [PATCH 18/23] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 41 +++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 67ad18530..3522d5d23 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -708,16 +708,15 @@ def get_all_pop_lengths( def explode_by_vep_annotation( t: Union[hl.Table, hl.MatrixTable], - vep_annotation: str, + vep_annotation: str = "transcript_consequences", vep_root: str = "vep", ) -> Union[hl.Table, hl.MatrixTable]: """ - Annotate specified VEP annotation if it's not in the input Table or MatrixTable and explode the VEP annotation. + Explode the specified VEP annotation on the input Table/MatrixTable. :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in VEP annotation that will be annotated, - if necessary, and explode. - :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param vep_annotation: Name of annotation in `vep_root` to explode. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. :return: Table or MatrixTable with exploded VEP annotation. """ # Annotate 'worst_csq_by_gene' to t if it's specified for `vep_annotation`. @@ -726,7 +725,7 @@ def explode_by_vep_annotation( if vep_annotation not in t[vep_root].keys(): raise ValueError( - f"{vep_annotation} is not a row field of the VEP annotation in" + f"{vep_annotation} is not a row field of the {vep_root} annotation in" " Table/MatrixTable!" ) # Create top-level annotation for `vep_annotation` and explode it. @@ -742,7 +741,7 @@ def explode_by_vep_annotation( def get_constraint_grouping_expr( vep_annotation_expr: hl.StructExpression, - coverage_expr: hl.Int32Expression = None, + coverage_expr: Optional[hl.Int32Expression] = None, include_transcript_group: bool = True, include_canonical_group: bool = True, ) -> Dict[str, Union[hl.StringExpression, hl.Int32Expression, hl.BooleanExpression]]: @@ -754,7 +753,7 @@ def get_constraint_grouping_expr( - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation in `vep_annotation` - gene - 'gene_symbol' annotation inside `vep_annotation` - - coverage - exome coverage + - coverage - exome coverage if `coverage_expr` is specified - transcript (added when `include_transcript_group` is True) - canonical (added when `include_canonical_group` is True) @@ -769,10 +768,10 @@ def get_constraint_grouping_expr( - canonical (if `include_canonical_group` is True) :param vep_annotation_expr: StructExpression of VEP annotation. - :param coverage_expr: Int32Expression of exome coverage. Default is None. - :param include_transcript_group: Wether to include the transcript annotation in the + :param coverage_expr: Optional Int32Expression of exome coverage. Default is None. + :param include_transcript_group: Whether to include the transcript annotation in the groupings. Default is True. - :param include_canonical_group: Wether to include canonical annotation in the + :param include_canonical_group: Whether to include canonical annotation in the groupings. Default is True. :return: A dictionary with keys as annotation names and values as actual annotations. @@ -802,9 +801,9 @@ def annotate_exploded_vep_for_constraint_groupings( ht: hl.Table, vep_annotation: str = "transcript_consequences" ) -> Tuple[Union[hl.Table, hl.MatrixTable], Tuple[str]]: """ - Annotate constraint annotations used for groupings. + Annotate Table with annotations used for constraint groupings. - Function explodes the specified VEP annotation and annotates the following + Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations: - annotation -'most_severe_consequence' annotation in `vep_annotation` - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation @@ -817,14 +816,14 @@ def annotate_exploded_vep_for_constraint_groupings( "transcript_consequences") .. note:: - This function expects that the following fields are present in `ht`: + This function expects that the following annotations are present in `ht`: - vep - exome_coverage :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in VEP annotation (one of + :param vep_annotation: Name of annotation in 'vep' annotation (one of "transcript_consequences" and "worst_csq_by_gene") that will be used for - constraint annotation. Defaults to "transcript_consequences". + obtaining constraint annotations. Default is "transcript_consequences". :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ @@ -848,7 +847,7 @@ def annotate_exploded_vep_for_constraint_groupings( def compute_expected_variants( ht: hl.Table, - plateau_models: hl.StructExpression, + plateau_models_expr: hl.StructExpression, mu_expr: hl.Float64Expression, cov_corr_expr: hl.Float64Expression, cpg_expr: hl.BooleanExpression, @@ -858,7 +857,7 @@ def compute_expected_variants( Apply plateau models for all sites and for a population (if specified) to compute predicted proportion observed ratio and expected variant counts. .. note: - Function expects following annotations to be present in `ht`: + Function expects the following annotations to be present in `ht`: - cpg - observed_variants - possible_variants @@ -872,20 +871,20 @@ def compute_expected_variants( :param mu_expr: Float64Expression of mutation rate. :param cov_corr_expr: Float64Expression of corrected coverage expression. :param cpg_expr: BooleanExpression noting whether a site is a CPG site. - :param pop: Population that will be used when applying plateau model. Default is + :param pop: Optional population to use when applying plateau model. Default is None. :return: A dictionary with predicted proportion observed ratio and expected variant counts. """ if pop is None: pop = "" - plateau_model = hl.literal(plateau_models.total)[cpg_expr] + plateau_model = hl.literal(plateau_models_expr.total)[cpg_expr] slope = plateau_model[1] intercept = plateau_model[0] agg_func = hl.agg.sum ann_to_sum = ["observed_variants", "possible_variants"] else: - plateau_model = hl.literal(plateau_models[pop]) + plateau_model = hl.literal(plateau_models_expr[pop]) slope = hl.map(lambda f: f[cpg_expr][1], plateau_model) intercept = hl.map(lambda f: f[cpg_expr][0], plateau_model) agg_func = hl.agg.array_sum From b400c8eff24fa990c58d38926349a51424dffcf1 Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 1 Dec 2022 04:45:31 -0500 Subject: [PATCH 19/23] fix change requests --- gnomad/utils/constraint.py | 87 +++++++++++++------------------------- gnomad/utils/vep.py | 29 +++++++++++++ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 3522d5d23..2433de25f 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -706,39 +706,6 @@ def get_all_pop_lengths( return pop_lengths -def explode_by_vep_annotation( - t: Union[hl.Table, hl.MatrixTable], - vep_annotation: str = "transcript_consequences", - vep_root: str = "vep", -) -> Union[hl.Table, hl.MatrixTable]: - """ - Explode the specified VEP annotation on the input Table/MatrixTable. - - :param t: Input Table or MatrixTable. - :param vep_annotation: Name of annotation in `vep_root` to explode. - :param vep_root: Name used for root VEP annotation. Default is 'vep'. - :return: Table or MatrixTable with exploded VEP annotation. - """ - # Annotate 'worst_csq_by_gene' to t if it's specified for `vep_annotation`. - if vep_annotation == "worst_csq_by_gene": - t = process_consequences(t) - - if vep_annotation not in t[vep_root].keys(): - raise ValueError( - f"{vep_annotation} is not a row field of the {vep_root} annotation in" - " Table/MatrixTable!" - ) - # Create top-level annotation for `vep_annotation` and explode it. - if isinstance(t, hl.Table): - t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) - t = t.explode(t[vep_annotation]) - else: - t = t.transmute_rows(**{vep_annotation: t[vep_root][vep_annotation]}) - t = t.explode_rows(t[vep_annotation]) - - return t - - def get_constraint_grouping_expr( vep_annotation_expr: hl.StructExpression, coverage_expr: Optional[hl.Int32Expression] = None, @@ -749,13 +716,17 @@ def get_constraint_grouping_expr( Collect annotations used for constraint groupings. Function collects the following annotations: - - annotation - 'most_severe_consequence' annotation in `vep_annotation` - - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation - in `vep_annotation` - - gene - 'gene_symbol' annotation inside `vep_annotation` + - annotation - 'most_severe_consequence' annotation in `vep_annotation_expr` + - modifier - classic lof annotation from 'lof' annotation in + `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in + `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation_expr`, or "None" if neither is defined + - gene - 'gene_symbol' annotation inside `vep_annotation_expr` - coverage - exome coverage if `coverage_expr` is specified - - transcript (added when `include_transcript_group` is True) - - canonical (added when `include_canonical_group` is True) + - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when + `include_transcript_group` is True) + - canonical from `vep_annotation_expr` (added when `include_canonical_group` is + True) .. note:: This function expects that the following fields are present in @@ -806,14 +777,16 @@ def annotate_exploded_vep_for_constraint_groupings( Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations: - annotation -'most_severe_consequence' annotation in `vep_annotation` - - modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation - in `vep_annotation` + - modifier - classic lof annotation from 'lof' annotation in + `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in + `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation_expr`, or "None" if neither is defined - gene - 'gene_symbol' annotation inside `vep_annotation` - coverage - exome coverage in `t` - - transcript (added when `vep_annotation` is specified as - "transcript_consequences") - - canonical (added when `vep_annotation` is specified as - "transcript_consequences") + - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when + `include_transcript_group` is True) + - canonical from `vep_annotation_expr` (added when `include_canonical_group` is + True) .. note:: This function expects that the following annotations are present in `ht`: @@ -827,13 +800,18 @@ def annotate_exploded_vep_for_constraint_groupings( :return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations. """ - # Explode the specified VEP annotation. - ht = explode_by_vep_annotation(ht, vep_annotation) if vep_annotation == "transcript_consequences": include_transcript_group = include_canonical_group = True else: include_transcript_group = include_canonical_group = False + # Annotate 'worst_csq_by_gene' to `ht` if it's specified for `vep_annotation`. + if vep_annotation == "worst_csq_by_gene": + ht = process_consequences(ht) + + # Explode the specified VEP annotation. + ht = explode_by_vep_annotation(ht, vep_annotation) + # Collect the annotations used for groupings. groupings = get_constraint_grouping_expr( ht[vep_annotation], @@ -856,18 +834,11 @@ def compute_expected_variants( """ Apply plateau models for all sites and for a population (if specified) to compute predicted proportion observed ratio and expected variant counts. - .. note: - Function expects the following annotations to be present in `ht`: - - cpg - - observed_variants - - possible_variants - :param ht: Input Table. - :param plateau_models: Linear models (output of `build_models()` in - gnomad_methods`), with the values of the dictionary formatted as a - StrucExpression of intercept and slope, that calibrates mutation rate to - proportion observed for high coverage exome. It includes models for CpG s, - non-CpG sites, and each population in `POPS`. + :param plateau_models: Linear models (output of `build_models()`, with the values + of the dictionary formatted as a StructExpression of intercept and slope, that + calibrates mutation rate to proportion observed for high coverage exomes. It + includes models for CpG, non-CpG sites, and each population in `POPS`. :param mu_expr: Float64Expression of mutation rate. :param cov_corr_expr: Float64Expression of corrected coverage expression. :param cpg_expr: BooleanExpression noting whether a site is a CPG site. diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index ecdcb5044..c0135003d 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -640,3 +640,32 @@ def add_most_severe_csq_to_tc_within_vep_root( if isinstance(t, hl.MatrixTable) else t.annotate(**{vep_root: annotation}) ) + + +def explode_by_vep_annotation( + t: Union[hl.Table, hl.MatrixTable], + vep_annotation: str = "transcript_consequences", + vep_root: str = "vep", +) -> Union[hl.Table, hl.MatrixTable]: + """ + Explode the specified VEP annotation on the input Table/MatrixTable. + + :param t: Input Table or MatrixTable. + :param vep_annotation: Name of annotation in `vep_root` to explode. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :return: Table or MatrixTable with exploded VEP annotation. + """ + if vep_annotation not in t[vep_root].keys(): + raise ValueError( + f"{vep_annotation} is not a row field of the {vep_root} annotation in" + " Table/MatrixTable!" + ) + # Create top-level annotation for `vep_annotation` and explode it. + if isinstance(t, hl.Table): + t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode(t[vep_annotation]) + else: + t = t.transmute_rows(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode_rows(t[vep_annotation]) + + return t From e5560c43ca210b32ab63cfea6504e8a983b6db8a Mon Sep 17 00:00:00 2001 From: averywpx Date: Thu, 1 Dec 2022 06:09:27 -0500 Subject: [PATCH 20/23] small fix --- gnomad/utils/constraint.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 2433de25f..c2d431cd1 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -5,7 +5,10 @@ import hail as hl -from gnomad.utils.vep import process_consequences +from gnomad.utils.vep import ( + process_consequences, + explode_by_vep_annotation +) logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -776,17 +779,17 @@ def annotate_exploded_vep_for_constraint_groupings( Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations: - - annotation -'most_severe_consequence' annotation in `vep_annotation` - - modifier - classic lof annotation from 'lof' annotation in - `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in - `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in - `vep_annotation_expr`, or "None" if neither is defined - - gene - 'gene_symbol' annotation inside `vep_annotation` - - coverage - exome coverage in `t` - - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when - `include_transcript_group` is True) - - canonical from `vep_annotation_expr` (added when `include_canonical_group` is - True) + - annotation -'most_severe_consequence' annotation in `vep_annotation` + - modifier - classic lof annotation from 'lof' annotation in + `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in + `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation_expr`, or "None" if neither is defined + - gene - 'gene_symbol' annotation inside `vep_annotation` + - coverage - exome coverage in `t` + - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when + `include_transcript_group` is True) + - canonical from `vep_annotation_expr` (added when `include_canonical_group` is + True) .. note:: This function expects that the following annotations are present in `ht`: From ca9301d1f072dfcb3063c4e51bfc6cd382f40eb3 Mon Sep 17 00:00:00 2001 From: averywpx <51382435+averywpx@users.noreply.github.com> Date: Sun, 4 Dec 2022 15:12:10 -0500 Subject: [PATCH 21/23] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/constraint.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index c2d431cd1..a7eabcf9e 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -781,14 +781,14 @@ def annotate_exploded_vep_for_constraint_groupings( annotations: - annotation -'most_severe_consequence' annotation in `vep_annotation` - modifier - classic lof annotation from 'lof' annotation in - `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in - `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in - `vep_annotation_expr`, or "None" if neither is defined + `vep_annotation`, LOFTEE annotation from 'lof' annotation in + `vep_annotation`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation`, or "None" if neither is defined - gene - 'gene_symbol' annotation inside `vep_annotation` - - coverage - exome coverage in `t` - - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when + - coverage - exome coverage in `ht` + - transcript - id from 'transcript_id' in `vep_annotation` (added when `include_transcript_group` is True) - - canonical from `vep_annotation_expr` (added when `include_canonical_group` is + - canonical from `vep_annotation` (added when `include_canonical_group` is True) .. note:: @@ -838,7 +838,7 @@ def compute_expected_variants( Apply plateau models for all sites and for a population (if specified) to compute predicted proportion observed ratio and expected variant counts. :param ht: Input Table. - :param plateau_models: Linear models (output of `build_models()`, with the values + :param plateau_models_expr: Linear models (output of `build_models()`, with the values of the dictionary formatted as a StructExpression of intercept and slope, that calibrates mutation rate to proportion observed for high coverage exomes. It includes models for CpG, non-CpG sites, and each population in `POPS`. @@ -876,7 +876,7 @@ def compute_expected_variants( } # Generate sum aggregators for 'observed_variants' and 'possible_variants' on - # the entire dataset if pop is None, and for `downsampling_counts` for + # the entire dataset if pop is None, and for 'downsampling_counts' for # specified population if pop is not None. agg_expr.update({ann: agg_func(ht[ann]) for ann in ann_to_sum}) From e25144629c79949c5ebd956ff79df9e161d66acd Mon Sep 17 00:00:00 2001 From: averywpx Date: Sun, 4 Dec 2022 15:26:41 -0500 Subject: [PATCH 22/23] pass check --- gnomad/utils/constraint.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index a7eabcf9e..7091fa823 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -5,10 +5,7 @@ import hail as hl -from gnomad.utils.vep import ( - process_consequences, - explode_by_vep_annotation -) +from gnomad.utils.vep import process_consequences, explode_by_vep_annotation logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -777,8 +774,7 @@ def annotate_exploded_vep_for_constraint_groupings( """ Annotate Table with annotations used for constraint groupings. - Function explodes the specified VEP annotation (`vep_annotation`) and adds the following - annotations: + Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations: - annotation -'most_severe_consequence' annotation in `vep_annotation` - modifier - classic lof annotation from 'lof' annotation in `vep_annotation`, LOFTEE annotation from 'lof' annotation in From 0dbd6b823c1ba987f4a1ba68041b6ec574f94278 Mon Sep 17 00:00:00 2001 From: averywpx Date: Sun, 4 Dec 2022 15:37:36 -0500 Subject: [PATCH 23/23] to pass checks --- gnomad/utils/constraint.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index 7091fa823..0255ba191 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -5,7 +5,7 @@ import hail as hl -from gnomad.utils.vep import process_consequences, explode_by_vep_annotation +from gnomad.utils.vep import explode_by_vep_annotation, process_consequences logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -775,17 +775,17 @@ def annotate_exploded_vep_for_constraint_groupings( Annotate Table with annotations used for constraint groupings. Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations: - - annotation -'most_severe_consequence' annotation in `vep_annotation` - - modifier - classic lof annotation from 'lof' annotation in - `vep_annotation`, LOFTEE annotation from 'lof' annotation in - `vep_annotation`, PolyPhen annotation from 'polyphen_prediction' in - `vep_annotation`, or "None" if neither is defined - - gene - 'gene_symbol' annotation inside `vep_annotation` - - coverage - exome coverage in `ht` - - transcript - id from 'transcript_id' in `vep_annotation` (added when - `include_transcript_group` is True) - - canonical from `vep_annotation` (added when `include_canonical_group` is - True) + - annotation -'most_severe_consequence' annotation in `vep_annotation` + - modifier - classic lof annotation from 'lof' annotation in + `vep_annotation`, LOFTEE annotation from 'lof' annotation in + `vep_annotation`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation`, or "None" if neither is defined + - gene - 'gene_symbol' annotation inside `vep_annotation` + - coverage - exome coverage in `ht` + - transcript - id from 'transcript_id' in `vep_annotation` (added when + `include_transcript_group` is True) + - canonical from `vep_annotation` (added when `include_canonical_group` is + True) .. note:: This function expects that the following annotations are present in `ht`: