broadinstitute · KoalaQin · Jun 20, 2023 · May 25, 2023 · May 25, 2023 · May 25, 2023
@@ -4,6 +4,7 @@
 from typing import Dict, List, Optional, Union
 
 import hail as hl
+from hail.utils.misc import new_temp_file
 
 from gnomad.resources.grch38.gnomad import CURRENT_MAJOR_RELEASE, POPS, SEXES
 from gnomad.utils.vcf import HISTS, SORT_ORDER, make_label_combos
@@ -982,3 +983,93 @@ def validate_release_t(
             t, info_metrics, non_info_metrics, n_sites, missingness_threshold
         )
     logger.info("VALIDITY CHECKS COMPLETE")
+
+
+def count_vep_annotated_variants_per_interval(
+    vep_ht: hl.Table, interval_ht: hl.Table
+) -> hl.Table:
+    """
+    Calculate the count of VEP annotated variants in `vep_ht` per interval defined by `interval_ht`.
+
+    .. note::
+
+        - `vep_ht` must contain the 'vep.transcript_consequences' array field, which
+          contains a 'biotype' field to determine whether a variant is in a
+          "protein-coding" gene.
+        - `interval_ht` should be indexed by 'locus' and contain a 'gene_stable_ID'
+          field. For example, an interval Table containing the intervals of
+          protein-coding genes of a specific Ensembl release.
+
+    The returned Table will have the following fields added:
+        - n_total_variants: The number of total variants in the interval.
+        - n_pcg_variants: The number of variants in the interval that are annotated as
+          "protein-coding".
+
+    :param vep_ht: VEP-annotated Table.
+    :param interval_ht: Interval Table.
+    :return: Interval Table with annotations for the counts of total variants and
+        variants annotated as "protein-coding" in biotype.
+    """
+    logger.info(
+        "Counting the number of total variants and protein-coding variants in each"
+        " interval..."
+    )
+
+    # Select the vep_ht and annotate genes that have a matched interval from
+    # the interval_ht and are protein-coding.
+    vep_ht = vep_ht.select(
+        gene_stable_ID=interval_ht.index(vep_ht.locus, all_matches=True).gene_stable_ID,
+        in_pcg=vep_ht.vep.transcript_consequences.biotype.contains("protein_coding"),
+    )
+
+    vep_ht = vep_ht.filter(hl.is_defined(vep_ht.gene_stable_ID))
+
+    # Explode the vep_ht by gene_stable_ID.
+    vep_ht = vep_ht.explode(vep_ht.gene_stable_ID)
+
+    # Count the number of total variants and "protein-coding" variants in each interval.
+    count_ht = vep_ht.group_by(vep_ht.gene_stable_ID).aggregate(
+        all_variants=hl.agg.count(),
+        variants_in_pcg=hl.agg.count_where(vep_ht.in_pcg),
+    )
+
+    interval_ht = interval_ht.annotate(**count_ht[interval_ht.gene_stable_ID])
+
+    logger.info("Checkpointing the counts per interval...")
+    interval_ht = interval_ht.checkpoint(
+        new_temp_file("validity_checks.vep_count_per_interval", extension="ht"),
+        overwrite=True,
+    )
+
+    logger.info("Genes without variants annotated: ")
+    gene_sets = interval_ht.aggregate(
+        hl.struct(
+            na_genes=hl.agg.filter(
+                hl.is_missing(interval_ht.variants_in_pcg)
+                | (interval_ht.variants_in_pcg == 0),
+                hl.agg.collect_as_set(interval_ht.gene_stable_ID),
+            ),
+            partial_pcg_genes=hl.agg.filter(
+                (interval_ht.all_variants != 0)
+                & (interval_ht.variants_in_pcg != 0)
+                & (interval_ht.all_variants != interval_ht.variants_in_pcg),
+                hl.agg.collect_as_set(interval_ht.gene_stable_ID),
+            ),
+        )
+    )
+
+    logger.info(
+        "%s gene(s) have no variants annotated as protein-coding in Biotype. It is"
+        " likely these genes are not covered by the variants in 'vep_ht'. These genes"
+        " are: %s",
+        len(gene_sets.na_genes),
+        gene_sets.na_genes,
+    )
+
+    logger.info(
+        "%s gene(s) have a subset of variants annotated as protein-coding biotype in"
+        " their defined intervals",
+        len(gene_sets.partial_pcg_genes),
+    )
+
+    return interval_ht
@@ -57,6 +57,32 @@ def _import_dbsnp(**kwargs) -> hl.Table:
     return dbsnp
 
 
+def _import_ensembl_interval(path) -> hl.Table:
+    """
+    Import and parse Ensembl intervals of protein-coding genes to a Hail Table.
+
+    File is expected to include only the following fields: gene_stable_ID, chr, start, end, source_gene, gene_name, and type.
+
+    :param path: Path to the interval Table file.
+    """
+    ensembl = hl.import_table(
+        path,
+        delimiter="\t",
+        min_partitions=100,
+        impute=True,
+    )
+
+    ensembl = ensembl.key_by(
+        interval=hl.locus_interval(
+            "chr" + ensembl.chr,
+            ensembl.start,
+            ensembl.end,
+            reference_genome="GRCh38",
+        )
+    )
+    return ensembl
+
+
 # Resources with no versioning needed
 purcell_5k_intervals = GnomadPublicTableResource(
     path="gs://gnomad-public-requester-pays/resources/grch38/purcell_5k_intervals/purcell5k.ht",
@@ -135,6 +161,31 @@ def _import_dbsnp(**kwargs) -> hl.Table:
     },
 )
 
+# These Ensembl Interval Tables are focused on protein-coding genes on chr1-22,X,Y.
+# Downloaded from the biomart of Ensembl Archive (https://useast.ensembl.org/info/website/archives/index.html)
+# Ensembl 101 & 105 are included, since 101 was used to annotate gnomAD v3 and 105 to gnomAD v4.
+# Basic stats: 19924 protein-coding genes in Ensembl 101, and1 19951
+# protein-coding genes in Ensembl 105.
+ensembl_interval = VersionedTableResource(
+    default_version="105",
+    versions={
+        "105": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.ht",
+            import_func=_import_ensembl_interval,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.tsv",
+            },
+        ),
+        "101": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.ht",
+            import_func=_import_ensembl_interval,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.tsv",
+            },
+        ),
+    },
+)
+
 clinvar = VersionedTableResource(
     default_version="20190923",
     versions={