Skip to content

Commit 75fb930

Browse files
authored
Merge pull request #733 from broadinstitute/jg/annotate_context_ht_for_constraint
Add `assemble_constraint_context_ht` function to create a fully annotated context HT for computing constraint on
2 parents 099a825 + cc73837 commit 75fb930

File tree

3 files changed

+328
-2
lines changed

3 files changed

+328
-2
lines changed

gnomad/resources/grch37/reference_data.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# noqa: D100
22

3+
from typing import Optional, Union
4+
35
import hail as hl
46

57
from gnomad.resources.resource_utils import (
@@ -10,6 +12,7 @@
1012
import_gencode,
1113
import_sites_vcf,
1214
)
15+
from gnomad.utils.constraint import transform_methylation_level
1316

1417

1518
def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
@@ -370,3 +373,32 @@ def get_truth_ht() -> hl.Table:
370373
),
371374
},
372375
)
376+
377+
378+
def transform_grch37_methylation(
379+
ht: Optional[hl.Table] = None,
380+
methylation_expr: Optional[hl.expr.NumericExpression] = None,
381+
) -> Union[hl.Table, hl.expr.NumericExpression]:
382+
"""
383+
Transform methylation level from the GRCh37 methylation resource to a 0-2 scale.
384+
385+
.. note::
386+
387+
One of ht or methylation_expr must be provided.
388+
389+
The GRCh37 methylation resource provides a MEAN score ranging from 0 to 1. We transform
390+
this to a 0-2 scale by converting any value greater than 0.6 to 2, any value less
391+
than or equal to 0.2 to 0, and any value in between to 1.
392+
393+
:param ht: Optional Hail Table with methylation data. Default is None.
394+
:param methylation_expr: Optional methylation level expression. Default is None.
395+
:return: Transformed methylation level expression or annotated Hail Table.
396+
"""
397+
if ht is None and methylation_expr is None:
398+
raise ValueError("One of 'ht' or 'methylation_expr' must be provided")
399+
400+
return transform_methylation_level(
401+
methylation_expr="MEAN" if methylation_expr is None else methylation_expr,
402+
methylation_cutoffs=(0.2, 0.6),
403+
ht=ht,
404+
)

gnomad/resources/grch38/reference_data.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# noqa: D100
22

3+
from typing import Optional, Union
4+
35
import hail as hl
46
from hail import Table
57

@@ -13,6 +15,7 @@
1315
import_gencode,
1416
import_sites_vcf,
1517
)
18+
from gnomad.utils.constraint import transform_methylation_level
1619
from gnomad.utils.vep import vep_or_lookup_vep
1720

1821

@@ -401,3 +404,50 @@ def get_truth_ht() -> Table:
401404
),
402405
},
403406
)
407+
408+
409+
def transform_grch38_methylation(
410+
ht: Optional[hl.Table] = None,
411+
methylation_expr: Optional[hl.expr.NumericExpression] = None,
412+
) -> Union[hl.Table, hl.expr.NumericExpression]:
413+
"""
414+
Transform methylation level from the GRCh38 methylation resource to a 0-2 scale.
415+
416+
.. note::
417+
418+
One of ht or methylation_expr must be provided.
419+
420+
The GRCh38 methylation resource provides a score ranging from 0 to 15 for autosomes. The
421+
determination of this score is described in Chen et al:
422+
https://www.biorxiv.org/content/10.1101/2022.03.20.485034v2.full
423+
For chrX, methylation scores range from 0 to 12, but these scores are not directly
424+
comparable to the autosome scores (chrX and autosomes were analyzed separately and
425+
levels are relative). Cutoffs to translate these scores to the 0-2 methylation
426+
level were determined by correlating these scores with the GRCh37 liftover scores.
427+
Proposed cutoffs are: 0, 1-5, 6+ for autosomes, and 0, 1-3, 4+ for chrX.
428+
429+
:param ht: Optional Hail Table with methylation data. Default is None.
430+
:param methylation_expr: Optional methylation level expression. Default is None.
431+
:return: Transformed methylation level expression or annotated Hail Table.
432+
"""
433+
if ht is None and methylation_expr is None:
434+
raise ValueError("One of 'ht' or 'methylation_expr' must be provided")
435+
436+
if methylation_expr is None:
437+
methylation_expr = ht.methylation_level
438+
439+
if ht is None:
440+
locus_expr = methylation_expr._indices.source.locus
441+
else:
442+
locus_expr = ht.locus
443+
444+
methylation_expr = hl.if_else(
445+
locus_expr.contig != "chrX",
446+
transform_methylation_level(methylation_expr, (0, 5)),
447+
transform_methylation_level(methylation_expr, (0, 3)),
448+
)
449+
450+
if ht is not None:
451+
return ht.annotate(methylation_level=methylation_expr)
452+
453+
return methylation_expr

gnomad/utils/constraint.py

Lines changed: 246 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22

33
import copy
44
import logging
5-
from typing import Any, Dict, List, Optional, Tuple, Union
5+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
66

77
import hail as hl
88
from hail.utils.misc import divide_null, new_temp_file
99

10-
from gnomad.utils.vep import explode_by_vep_annotation, process_consequences
10+
from gnomad.utils.reference_genome import get_reference_genome
11+
from gnomad.utils.vep import (
12+
add_most_severe_consequence_to_consequence,
13+
explode_by_vep_annotation,
14+
process_consequences,
15+
)
1116

1217
logging.basicConfig(
1318
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
@@ -601,6 +606,245 @@ def collapse_strand(
601606
)
602607

603608

609+
def transform_methylation_level(
610+
methylation_expr: Union[str, hl.expr.NumericExpression] = "methylation_level",
611+
methylation_cutoffs: Tuple[Union[int, float], Union[int, float]] = (0, 5),
612+
ht: Optional[hl.Table] = None,
613+
) -> Union[hl.Table, hl.expr.NumericExpression]:
614+
"""
615+
Transform methylation level into a 0 to 2 scale.
616+
617+
The methylation level is transformed to a 0-2 scale based on the provided cutoffs.
618+
The methylation level is assigned a value of 2 if it is greater than the second
619+
cutoff, 1 if it is greater than the first cutoff (but less than or equal to the
620+
second), and 0 otherwise.
621+
622+
:param methylation_expr: Methylation level expression or annotation name in `ht`. If
623+
`methylation_expr` is a string, `ht` must be provided.
624+
:param methylation_cutoffs: Tuple of two integers/floats representing the cutoffs
625+
for the methylation level transformation. Default is (0, 5).
626+
:param ht: Input Table. Default is None.
627+
:return: Table with methylation level annotation added or methylation level
628+
expression.
629+
"""
630+
if isinstance(methylation_expr, str) and ht is None:
631+
raise ValueError("ht must be provided if methylation_expr is a string.")
632+
633+
if isinstance(methylation_expr, str):
634+
methylation_expr = ht[methylation_expr]
635+
636+
methylation_expr = (
637+
hl.case()
638+
.when(methylation_expr > methylation_cutoffs[1], 2)
639+
.when(methylation_expr > methylation_cutoffs[0], 1)
640+
.default(0)
641+
)
642+
643+
if ht is not None:
644+
return ht.annotate(methylation_level=methylation_expr)
645+
646+
return methylation_expr
647+
648+
649+
def assemble_constraint_context_ht(
650+
ht: hl.Table,
651+
methylation_ht: hl.Table = None,
652+
gerp_ht: hl.Table = None,
653+
coverage_hts: Dict[str, hl.Table] = None,
654+
an_hts: Dict[str, hl.Table] = None,
655+
freq_hts: Dict[str, hl.Table] = None,
656+
filter_hts: Dict[str, hl.Table] = None,
657+
transformation_funcs: Optional[Dict[str, Callable]] = None,
658+
) -> hl.Table:
659+
"""
660+
Assemble context Table with necessary annotations for constraint calculations.
661+
662+
.. note::
663+
664+
Checks for 'was_split' annotation in Table. If not present, splits
665+
multiallelic sites.
666+
667+
The following annotations are added to the output Table:
668+
669+
- ref: Reference allele.
670+
- alt: Alternate allele.
671+
- context: Trimer context.
672+
- annotations added by `annotate_mutation_type()`, `collapse_strand()`, and
673+
`add_most_severe_csq_to_tc_within_vep_root()`.
674+
675+
Depending on the annotations provided, the following annotations may also be added:
676+
677+
- 'methylation_level': Methylation level annotation will be added if
678+
`methylation_ht` is provided.
679+
- 'gerp': GERP annotation will be added if `gerp_ht` is provided.
680+
- 'coverage': Coverage annotations will be added if `coverage_hts` is provided.
681+
- 'AN': Allele number annotations will be added if `an_hts` is provided.
682+
- 'freq': Frequency annotations will be added if `freq_hts` is provided.
683+
- 'filters': Filter annotations will be added if `filter_hts` is provided.
684+
685+
The `transformation_funcs` parameter can be used to transform the HTs before adding
686+
annotations. The keys should be the annotation names ('coverage', 'gerp', etc.) and
687+
the values should be functions that take the annotation Table and the ht that is
688+
being annotated as input and return the transformed and keyed annotation. If not
689+
provided, the following default transformations are used:
690+
691+
- 'methylation_level': Uses the 'methylation_level' annotation in the
692+
Methylation Table after transforming the methylation level to a 0-2 scale
693+
using `transform_grch37_methylation()` or `transform_grch38_methylation()`.
694+
- 'gerp': Uses the 'S' annotation in the GERP Table. If 'S' is missing, it
695+
defaults to 0.
696+
- 'coverage': If necessary, pulls out the first element of coverage statistics
697+
(which includes all samples). Relevant to v4, where coverage stats include
698+
additional elements to stratify by UK Biobank subset and platforms.
699+
- 'AN': Uses the 'AN' annotation in the allele number Table.
700+
- 'freq': Uses the 'freq' annotation in the frequency Table.
701+
- 'filters': Uses the 'filters' annotation in the filter Table.
702+
703+
The following global annotations may also be added to the output Table:
704+
705+
- 'an_globals': Global allele number annotations 'strata_sample_count' and
706+
'strata_meta' will be added if `an_hts` is provided.
707+
- 'freq_globals': Global frequency annotations 'freq_meta_sample_count' and
708+
'freq_meta' will be added if `freq_hts` is provided.
709+
710+
:param ht: Input context Table with VEP annotation.
711+
:param methylation_ht: Optional Table with methylation annotation. Default is None.
712+
:param gerp_ht: Optional Table with GERP annotation. Default is None.
713+
:param coverage_hts: An optional Dictionary with key as one of 'exomes' or
714+
'genomes' and values as corresponding coverage Tables. Default is None.
715+
:param an_hts: A Dictionary with key as one of 'exomes' or 'genomes' and
716+
values as corresponding allele number Tables. Default is None.
717+
:param transformation_funcs: An optional Dictionary to transform the HTs before
718+
adding annotations. Default is None, which will use some default transformations
719+
as described above.
720+
:return: Table with sites split and necessary annotations.
721+
"""
722+
ref = get_reference_genome(ht.locus)
723+
if ref == hl.get_reference("GRCh37"):
724+
from gnomad.resources.grch37 import transform_grch37_methylation as trans_methyl
725+
else:
726+
from gnomad.resources.grch38 import transform_grch38_methylation as trans_methyl
727+
728+
# Check if context Table is split, and if not, split multiallelic sites.
729+
if "was_split" not in list(ht.row):
730+
ht = hl.split_multi_hts(ht)
731+
732+
# Only need to keep the keys (locus, alleles) and the context and vep annotations.
733+
ht = ht.select("context", "vep")
734+
735+
# Filter Table to only contigs 1-22, X, Y.
736+
ht = hl.filter_intervals(
737+
ht, [hl.parse_locus_interval(c, ref.name) for c in ref.contigs[:24]]
738+
)
739+
740+
# Add annotations for 'ref' and 'alt'.
741+
ht = ht.annotate(ref=ht.alleles[0], alt=ht.alleles[1])
742+
743+
# Trim heptamer context to create trimer context.
744+
ht = trimer_from_heptamer(ht)
745+
746+
# Annotate mutation type (such as "CpG", "non-CpG transition", "transversion") and
747+
# collapse strands to deduplicate the context.
748+
ht = annotate_mutation_type(collapse_strand(ht))
749+
750+
# Add most_severe_consequence annotation to 'transcript_consequences' within the
751+
# vep root annotation.
752+
csqs = ht.vep.transcript_consequences
753+
csqs = add_most_severe_consequence_to_consequence(csqs)
754+
vep_csq_fields = [
755+
"transcript_id",
756+
"gene_id",
757+
"gene_symbol",
758+
"biotype",
759+
"most_severe_consequence",
760+
"mane_select",
761+
"canonical",
762+
"lof",
763+
"lof_flags",
764+
]
765+
vep_csq_fields = [x for x in vep_csq_fields if x in csqs.dtype.element_type]
766+
ht = ht.annotate(
767+
vep=ht.vep.select(
768+
"most_severe_consequence",
769+
transcript_consequences=csqs.map(lambda x: x.select(*vep_csq_fields)),
770+
)
771+
)
772+
773+
# Add 'methylation_level' and 'gerp' annotations if specified.
774+
transformation_funcs = transformation_funcs or {}
775+
776+
if "methylation_level" not in transformation_funcs:
777+
transformation_funcs["methylation_level"] = lambda x, t: hl.if_else(
778+
ht.cpg, trans_methyl(x)[t.locus].methylation_level, 0
779+
)
780+
781+
if "gerp" not in transformation_funcs:
782+
transformation_funcs["gerp"] = lambda x, t: hl.if_else(
783+
hl.is_missing(x[t.locus].S), 0, x[t.locus].S
784+
)
785+
786+
# If necessary, pull out first element of coverage statistics (which includes all
787+
# samples). Relevant to v4, where coverage stats include additional elements to
788+
# stratify by ukb subset and platforms.
789+
if "coverage" not in transformation_funcs:
790+
transformation_funcs["coverage"] = lambda x, t: (
791+
x[t.locus].coverage_stats[0] if "coverage_stats" in x.row else x[t.locus]
792+
)
793+
794+
if "AN" not in transformation_funcs:
795+
transformation_funcs["AN"] = lambda x, t: x[t.locus].AN
796+
797+
if "freq" not in transformation_funcs:
798+
transformation_funcs["freq"] = lambda x, t: x[t.key].freq
799+
800+
if "filters" not in transformation_funcs:
801+
transformation_funcs["filters"] = lambda x, t: x[t.key].filters
802+
803+
hts = {
804+
"methylation_level": methylation_ht,
805+
"gerp": gerp_ht,
806+
"coverage": coverage_hts,
807+
"AN": an_hts,
808+
"freq": freq_hts,
809+
"filters": filter_hts,
810+
}
811+
hts = {k: v for k, v in hts.items() if v is not None}
812+
exprs = {}
813+
for ann, ann_ht in hts.items():
814+
if isinstance(ann_ht, dict):
815+
ann_expr = hl.struct(
816+
**{k: transformation_funcs[ann](v, ht) for k, v in ann_ht.items()}
817+
)
818+
else:
819+
ann_expr = transformation_funcs[ann](ann_ht, ht)
820+
821+
exprs[ann] = ann_expr
822+
823+
ht = ht.annotate(**exprs)
824+
825+
# Add global annotations for 'AN' and 'freq' if HTs are provided.
826+
global_anns = {
827+
"an_globals": (an_hts, ["strata_sample_count", "strata_meta"]),
828+
"freq_globals": (freq_hts, ["freq_meta_sample_count", "freq_meta"]),
829+
}
830+
global_anns = {k: v for k, v in global_anns.items() if v[0] is not None}
831+
ht = ht.annotate_globals(
832+
**{
833+
k: hl.struct(
834+
**{
835+
data_type: ann_ht.select_globals(
836+
*[x for x in g if x in ann_ht.globals]
837+
).index_globals()
838+
for data_type, ann_ht in hts.items()
839+
}
840+
)
841+
for k, (hts, g) in global_anns.items()
842+
}
843+
)
844+
845+
return ht
846+
847+
604848
def build_models(
605849
coverage_ht: hl.Table,
606850
coverage_expr: hl.expr.Int32Expression,

0 commit comments

Comments
 (0)