Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions gnomad/assessment/validity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Dict, List, Optional, Union

import hail as hl
from hail.utils.misc import new_temp_file

from gnomad.resources.grch38.gnomad import CURRENT_MAJOR_RELEASE, POPS, SEXES
from gnomad.utils.vcf import HISTS, SORT_ORDER, make_label_combos
Expand Down Expand Up @@ -982,3 +983,93 @@ def validate_release_t(
t, info_metrics, non_info_metrics, n_sites, missingness_threshold
)
logger.info("VALIDITY CHECKS COMPLETE")


def count_vep_annotated_variants_per_interval(
vep_ht: hl.Table, interval_ht: hl.Table
) -> hl.Table:
"""
Calculate the count of VEP annotated variants in `vep_ht` per interval defined by `interval_ht`.

.. note::

- `vep_ht` must contain the 'vep.transcript_consequences' array field, which
contains a 'biotype' field to determine whether a variant is in a
"protein-coding" gene.
- `interval_ht` should be indexed by 'locus' and contain a 'gene_stable_ID'
field. For example, an interval Table containing the intervals of
protein-coding genes of a specific Ensembl release.

The returned Table will have the following fields added:
- n_total_variants: The number of total variants in the interval.
- n_pcg_variants: The number of variants in the interval that are annotated as
"protein-coding".

:param vep_ht: VEP-annotated Table.
:param interval_ht: Interval Table.
:return: Interval Table with annotations for the counts of total variants and
variants annotated as "protein-coding" in biotype.
"""
logger.info(
"Counting the number of total variants and protein-coding variants in each"
" interval..."
)

# Select the vep_ht and annotate genes that have a matched interval from
# the interval_ht and are protein-coding.
vep_ht = vep_ht.select(
gene_stable_ID=interval_ht.index(vep_ht.locus, all_matches=True).gene_stable_ID,
in_pcg=vep_ht.vep.transcript_consequences.biotype.contains("protein_coding"),
)

vep_ht = vep_ht.filter(hl.is_defined(vep_ht.gene_stable_ID))

# Explode the vep_ht by gene_stable_ID.
vep_ht = vep_ht.explode(vep_ht.gene_stable_ID)

# Count the number of total variants and "protein-coding" variants in each interval.
count_ht = vep_ht.group_by(vep_ht.gene_stable_ID).aggregate(
all_variants=hl.agg.count(),
variants_in_pcg=hl.agg.count_where(vep_ht.in_pcg),
)

interval_ht = interval_ht.annotate(**count_ht[interval_ht.gene_stable_ID])

logger.info("Checkpointing the counts per interval...")
interval_ht = interval_ht.checkpoint(
new_temp_file("validity_checks.vep_count_per_interval", extension="ht"),
overwrite=True,
)

logger.info("Genes without variants annotated: ")
gene_sets = interval_ht.aggregate(
hl.struct(
na_genes=hl.agg.filter(
hl.is_missing(interval_ht.variants_in_pcg)
| (interval_ht.variants_in_pcg == 0),
hl.agg.collect_as_set(interval_ht.gene_stable_ID),
),
partial_pcg_genes=hl.agg.filter(
(interval_ht.all_variants != 0)
& (interval_ht.variants_in_pcg != 0)
& (interval_ht.all_variants != interval_ht.variants_in_pcg),
hl.agg.collect_as_set(interval_ht.gene_stable_ID),
),
)
)

logger.info(
"%s gene(s) have no variants annotated as protein-coding in Biotype. It is"
" likely these genes are not covered by the variants in 'vep_ht'. These genes"
" are: %s",
len(gene_sets.na_genes),
gene_sets.na_genes,
)

logger.info(
"%s gene(s) have a subset of variants annotated as protein-coding biotype in"
" their defined intervals",
len(gene_sets.partial_pcg_genes),
)

return interval_ht
51 changes: 51 additions & 0 deletions gnomad/resources/grch38/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,32 @@ def _import_dbsnp(**kwargs) -> hl.Table:
return dbsnp


def _import_ensembl_interval(path) -> hl.Table:
"""
Import and parse Ensembl intervals of protein-coding genes to a Hail Table.

File is expected to include only the following fields: gene_stable_ID, chr, start, end, source_gene, gene_name, and type.

:param path: Path to the interval Table file.
"""
ensembl = hl.import_table(
path,
delimiter="\t",
min_partitions=100,
impute=True,
)

ensembl = ensembl.key_by(
interval=hl.locus_interval(
"chr" + ensembl.chr,
ensembl.start,
ensembl.end,
reference_genome="GRCh38",
)
)
return ensembl


# Resources with no versioning needed
purcell_5k_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch38/purcell_5k_intervals/purcell5k.ht",
Expand Down Expand Up @@ -135,6 +161,31 @@ def _import_dbsnp(**kwargs) -> hl.Table:
},
)

# These Ensembl Interval Tables are focused on protein-coding genes on chr1-22,X,Y.
# Downloaded from the biomart of Ensembl Archive (https://useast.ensembl.org/info/website/archives/index.html)
# Ensembl 101 & 105 are included, since 101 was used to annotate gnomAD v3 and 105 to gnomAD v4.
# Basic stats: 19924 protein-coding genes in Ensembl 101, and1 19951
# protein-coding genes in Ensembl 105.
ensembl_interval = VersionedTableResource(
default_version="105",
versions={
"105": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.ht",
import_func=_import_ensembl_interval,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.tsv",
},
),
"101": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.ht",
import_func=_import_ensembl_interval,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.tsv",
},
),
},
)

clinvar = VersionedTableResource(
default_version="20190923",
versions={
Expand Down