|
2 | 2 |
|
3 | 3 | import copy |
4 | 4 | import logging |
5 | | -from typing import Any, Dict, List, Optional, Tuple, Union |
| 5 | +from typing import Any, Callable, Dict, List, Optional, Tuple, Union |
6 | 6 |
|
7 | 7 | import hail as hl |
8 | 8 | from hail.utils.misc import divide_null, new_temp_file |
9 | 9 |
|
10 | | -from gnomad.utils.vep import explode_by_vep_annotation, process_consequences |
| 10 | +from gnomad.utils.reference_genome import get_reference_genome |
| 11 | +from gnomad.utils.vep import ( |
| 12 | + add_most_severe_consequence_to_consequence, |
| 13 | + explode_by_vep_annotation, |
| 14 | + process_consequences, |
| 15 | +) |
11 | 16 |
|
12 | 17 | logging.basicConfig( |
13 | 18 | format="%(asctime)s (%(name)s %(lineno)s): %(message)s", |
@@ -601,6 +606,245 @@ def collapse_strand( |
601 | 606 | ) |
602 | 607 |
|
603 | 608 |
|
| 609 | +def transform_methylation_level( |
| 610 | + methylation_expr: Union[str, hl.expr.NumericExpression] = "methylation_level", |
| 611 | + methylation_cutoffs: Tuple[Union[int, float], Union[int, float]] = (0, 5), |
| 612 | + ht: Optional[hl.Table] = None, |
| 613 | +) -> Union[hl.Table, hl.expr.NumericExpression]: |
| 614 | + """ |
| 615 | + Transform methylation level into a 0 to 2 scale. |
| 616 | +
|
| 617 | + The methylation level is transformed to a 0-2 scale based on the provided cutoffs. |
| 618 | + The methylation level is assigned a value of 2 if it is greater than the second |
| 619 | + cutoff, 1 if it is greater than the first cutoff (but less than or equal to the |
| 620 | + second), and 0 otherwise. |
| 621 | +
|
| 622 | + :param methylation_expr: Methylation level expression or annotation name in `ht`. If |
| 623 | + `methylation_expr` is a string, `ht` must be provided. |
| 624 | + :param methylation_cutoffs: Tuple of two integers/floats representing the cutoffs |
| 625 | + for the methylation level transformation. Default is (0, 5). |
| 626 | + :param ht: Input Table. Default is None. |
| 627 | + :return: Table with methylation level annotation added or methylation level |
| 628 | + expression. |
| 629 | + """ |
| 630 | + if isinstance(methylation_expr, str) and ht is None: |
| 631 | + raise ValueError("ht must be provided if methylation_expr is a string.") |
| 632 | + |
| 633 | + if isinstance(methylation_expr, str): |
| 634 | + methylation_expr = ht[methylation_expr] |
| 635 | + |
| 636 | + methylation_expr = ( |
| 637 | + hl.case() |
| 638 | + .when(methylation_expr > methylation_cutoffs[1], 2) |
| 639 | + .when(methylation_expr > methylation_cutoffs[0], 1) |
| 640 | + .default(0) |
| 641 | + ) |
| 642 | + |
| 643 | + if ht is not None: |
| 644 | + return ht.annotate(methylation_level=methylation_expr) |
| 645 | + |
| 646 | + return methylation_expr |
| 647 | + |
| 648 | + |
| 649 | +def assemble_constraint_context_ht( |
| 650 | + ht: hl.Table, |
| 651 | + methylation_ht: hl.Table = None, |
| 652 | + gerp_ht: hl.Table = None, |
| 653 | + coverage_hts: Dict[str, hl.Table] = None, |
| 654 | + an_hts: Dict[str, hl.Table] = None, |
| 655 | + freq_hts: Dict[str, hl.Table] = None, |
| 656 | + filter_hts: Dict[str, hl.Table] = None, |
| 657 | + transformation_funcs: Optional[Dict[str, Callable]] = None, |
| 658 | +) -> hl.Table: |
| 659 | + """ |
| 660 | + Assemble context Table with necessary annotations for constraint calculations. |
| 661 | +
|
| 662 | + .. note:: |
| 663 | +
|
| 664 | + Checks for 'was_split' annotation in Table. If not present, splits |
| 665 | + multiallelic sites. |
| 666 | +
|
| 667 | + The following annotations are added to the output Table: |
| 668 | +
|
| 669 | + - ref: Reference allele. |
| 670 | + - alt: Alternate allele. |
| 671 | + - context: Trimer context. |
| 672 | + - annotations added by `annotate_mutation_type()`, `collapse_strand()`, and |
| 673 | + `add_most_severe_csq_to_tc_within_vep_root()`. |
| 674 | +
|
| 675 | + Depending on the annotations provided, the following annotations may also be added: |
| 676 | +
|
| 677 | + - 'methylation_level': Methylation level annotation will be added if |
| 678 | + `methylation_ht` is provided. |
| 679 | + - 'gerp': GERP annotation will be added if `gerp_ht` is provided. |
| 680 | + - 'coverage': Coverage annotations will be added if `coverage_hts` is provided. |
| 681 | + - 'AN': Allele number annotations will be added if `an_hts` is provided. |
| 682 | + - 'freq': Frequency annotations will be added if `freq_hts` is provided. |
| 683 | + - 'filters': Filter annotations will be added if `filter_hts` is provided. |
| 684 | +
|
| 685 | + The `transformation_funcs` parameter can be used to transform the HTs before adding |
| 686 | + annotations. The keys should be the annotation names ('coverage', 'gerp', etc.) and |
| 687 | + the values should be functions that take the annotation Table and the ht that is |
| 688 | + being annotated as input and return the transformed and keyed annotation. If not |
| 689 | + provided, the following default transformations are used: |
| 690 | +
|
| 691 | + - 'methylation_level': Uses the 'methylation_level' annotation in the |
| 692 | + Methylation Table after transforming the methylation level to a 0-2 scale |
| 693 | + using `transform_grch37_methylation()` or `transform_grch38_methylation()`. |
| 694 | + - 'gerp': Uses the 'S' annotation in the GERP Table. If 'S' is missing, it |
| 695 | + defaults to 0. |
| 696 | + - 'coverage': If necessary, pulls out the first element of coverage statistics |
| 697 | + (which includes all samples). Relevant to v4, where coverage stats include |
| 698 | + additional elements to stratify by UK Biobank subset and platforms. |
| 699 | + - 'AN': Uses the 'AN' annotation in the allele number Table. |
| 700 | + - 'freq': Uses the 'freq' annotation in the frequency Table. |
| 701 | + - 'filters': Uses the 'filters' annotation in the filter Table. |
| 702 | +
|
| 703 | + The following global annotations may also be added to the output Table: |
| 704 | +
|
| 705 | + - 'an_globals': Global allele number annotations 'strata_sample_count' and |
| 706 | + 'strata_meta' will be added if `an_hts` is provided. |
| 707 | + - 'freq_globals': Global frequency annotations 'freq_meta_sample_count' and |
| 708 | + 'freq_meta' will be added if `freq_hts` is provided. |
| 709 | +
|
| 710 | + :param ht: Input context Table with VEP annotation. |
| 711 | + :param methylation_ht: Optional Table with methylation annotation. Default is None. |
| 712 | + :param gerp_ht: Optional Table with GERP annotation. Default is None. |
| 713 | + :param coverage_hts: An optional Dictionary with key as one of 'exomes' or |
| 714 | + 'genomes' and values as corresponding coverage Tables. Default is None. |
| 715 | + :param an_hts: A Dictionary with key as one of 'exomes' or 'genomes' and |
| 716 | + values as corresponding allele number Tables. Default is None. |
| 717 | + :param transformation_funcs: An optional Dictionary to transform the HTs before |
| 718 | + adding annotations. Default is None, which will use some default transformations |
| 719 | + as described above. |
| 720 | + :return: Table with sites split and necessary annotations. |
| 721 | + """ |
| 722 | + ref = get_reference_genome(ht.locus) |
| 723 | + if ref == hl.get_reference("GRCh37"): |
| 724 | + from gnomad.resources.grch37 import transform_grch37_methylation as trans_methyl |
| 725 | + else: |
| 726 | + from gnomad.resources.grch38 import transform_grch38_methylation as trans_methyl |
| 727 | + |
| 728 | + # Check if context Table is split, and if not, split multiallelic sites. |
| 729 | + if "was_split" not in list(ht.row): |
| 730 | + ht = hl.split_multi_hts(ht) |
| 731 | + |
| 732 | + # Only need to keep the keys (locus, alleles) and the context and vep annotations. |
| 733 | + ht = ht.select("context", "vep") |
| 734 | + |
| 735 | + # Filter Table to only contigs 1-22, X, Y. |
| 736 | + ht = hl.filter_intervals( |
| 737 | + ht, [hl.parse_locus_interval(c, ref.name) for c in ref.contigs[:24]] |
| 738 | + ) |
| 739 | + |
| 740 | + # Add annotations for 'ref' and 'alt'. |
| 741 | + ht = ht.annotate(ref=ht.alleles[0], alt=ht.alleles[1]) |
| 742 | + |
| 743 | + # Trim heptamer context to create trimer context. |
| 744 | + ht = trimer_from_heptamer(ht) |
| 745 | + |
| 746 | + # Annotate mutation type (such as "CpG", "non-CpG transition", "transversion") and |
| 747 | + # collapse strands to deduplicate the context. |
| 748 | + ht = annotate_mutation_type(collapse_strand(ht)) |
| 749 | + |
| 750 | + # Add most_severe_consequence annotation to 'transcript_consequences' within the |
| 751 | + # vep root annotation. |
| 752 | + csqs = ht.vep.transcript_consequences |
| 753 | + csqs = add_most_severe_consequence_to_consequence(csqs) |
| 754 | + vep_csq_fields = [ |
| 755 | + "transcript_id", |
| 756 | + "gene_id", |
| 757 | + "gene_symbol", |
| 758 | + "biotype", |
| 759 | + "most_severe_consequence", |
| 760 | + "mane_select", |
| 761 | + "canonical", |
| 762 | + "lof", |
| 763 | + "lof_flags", |
| 764 | + ] |
| 765 | + vep_csq_fields = [x for x in vep_csq_fields if x in csqs.dtype.element_type] |
| 766 | + ht = ht.annotate( |
| 767 | + vep=ht.vep.select( |
| 768 | + "most_severe_consequence", |
| 769 | + transcript_consequences=csqs.map(lambda x: x.select(*vep_csq_fields)), |
| 770 | + ) |
| 771 | + ) |
| 772 | + |
| 773 | + # Add 'methylation_level' and 'gerp' annotations if specified. |
| 774 | + transformation_funcs = transformation_funcs or {} |
| 775 | + |
| 776 | + if "methylation_level" not in transformation_funcs: |
| 777 | + transformation_funcs["methylation_level"] = lambda x, t: hl.if_else( |
| 778 | + ht.cpg, trans_methyl(x)[t.locus].methylation_level, 0 |
| 779 | + ) |
| 780 | + |
| 781 | + if "gerp" not in transformation_funcs: |
| 782 | + transformation_funcs["gerp"] = lambda x, t: hl.if_else( |
| 783 | + hl.is_missing(x[t.locus].S), 0, x[t.locus].S |
| 784 | + ) |
| 785 | + |
| 786 | + # If necessary, pull out first element of coverage statistics (which includes all |
| 787 | + # samples). Relevant to v4, where coverage stats include additional elements to |
| 788 | + # stratify by ukb subset and platforms. |
| 789 | + if "coverage" not in transformation_funcs: |
| 790 | + transformation_funcs["coverage"] = lambda x, t: ( |
| 791 | + x[t.locus].coverage_stats[0] if "coverage_stats" in x.row else x[t.locus] |
| 792 | + ) |
| 793 | + |
| 794 | + if "AN" not in transformation_funcs: |
| 795 | + transformation_funcs["AN"] = lambda x, t: x[t.locus].AN |
| 796 | + |
| 797 | + if "freq" not in transformation_funcs: |
| 798 | + transformation_funcs["freq"] = lambda x, t: x[t.key].freq |
| 799 | + |
| 800 | + if "filters" not in transformation_funcs: |
| 801 | + transformation_funcs["filters"] = lambda x, t: x[t.key].filters |
| 802 | + |
| 803 | + hts = { |
| 804 | + "methylation_level": methylation_ht, |
| 805 | + "gerp": gerp_ht, |
| 806 | + "coverage": coverage_hts, |
| 807 | + "AN": an_hts, |
| 808 | + "freq": freq_hts, |
| 809 | + "filters": filter_hts, |
| 810 | + } |
| 811 | + hts = {k: v for k, v in hts.items() if v is not None} |
| 812 | + exprs = {} |
| 813 | + for ann, ann_ht in hts.items(): |
| 814 | + if isinstance(ann_ht, dict): |
| 815 | + ann_expr = hl.struct( |
| 816 | + **{k: transformation_funcs[ann](v, ht) for k, v in ann_ht.items()} |
| 817 | + ) |
| 818 | + else: |
| 819 | + ann_expr = transformation_funcs[ann](ann_ht, ht) |
| 820 | + |
| 821 | + exprs[ann] = ann_expr |
| 822 | + |
| 823 | + ht = ht.annotate(**exprs) |
| 824 | + |
| 825 | + # Add global annotations for 'AN' and 'freq' if HTs are provided. |
| 826 | + global_anns = { |
| 827 | + "an_globals": (an_hts, ["strata_sample_count", "strata_meta"]), |
| 828 | + "freq_globals": (freq_hts, ["freq_meta_sample_count", "freq_meta"]), |
| 829 | + } |
| 830 | + global_anns = {k: v for k, v in global_anns.items() if v[0] is not None} |
| 831 | + ht = ht.annotate_globals( |
| 832 | + **{ |
| 833 | + k: hl.struct( |
| 834 | + **{ |
| 835 | + data_type: ann_ht.select_globals( |
| 836 | + *[x for x in g if x in ann_ht.globals] |
| 837 | + ).index_globals() |
| 838 | + for data_type, ann_ht in hts.items() |
| 839 | + } |
| 840 | + ) |
| 841 | + for k, (hts, g) in global_anns.items() |
| 842 | + } |
| 843 | + ) |
| 844 | + |
| 845 | + return ht |
| 846 | + |
| 847 | + |
604 | 848 | def build_models( |
605 | 849 | coverage_ht: hl.Table, |
606 | 850 | coverage_expr: hl.expr.Int32Expression, |
|
0 commit comments