Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
b1d89cf
Copy relevant changes from variant_report_code
matren395 Sep 11, 2023
f3b8864
Copy relevant changes to branch
matren395 Sep 11, 2023
0b3321f
Run Black on gnomad.py
matren395 Sep 11, 2023
96cdd95
isort
matren395 Sep 11, 2023
4533e20
DocString and one variable name
matren395 Sep 12, 2023
120e906
Update variable name in gnomad.py
matren395 Sep 12, 2023
aed049a
Apply suggestions from code review
matren395 Sep 13, 2023
381f233
Update annotations.py
matren395 Sep 13, 2023
769265f
add select and checkpoint
matren395 Sep 14, 2023
2309908
Update gnomad.py
matren395 Sep 14, 2023
28833bc
Black & Renaming
matren395 Sep 14, 2023
46d0c70
revise checkpoint structure
matren395 Sep 15, 2023
be68c7a
new_temp_file not _path oops:)
matren395 Sep 15, 2023
6c97aeb
Update gnomad/resources/grch38/gnomad.py
matren395 Sep 15, 2023
0a12289
PyLint!!!
matren395 Sep 15, 2023
1332bb8
Only collect globals once
theferrit32 Sep 19, 2023
51868fc
Perform coverage join in gnomad_gks so there's only 1 collect on cove…
theferrit32 Sep 19, 2023
6c686b5
Add skip_checkpoint argument to gnomad_gks
theferrit32 Sep 19, 2023
c0cf68d
Apply review comments.
theferrit32 Sep 22, 2023
0d465a9
Fix field name
theferrit32 Sep 26, 2023
1f95734
revise popmax
matren395 Sep 26, 2023
728620e
Move FAF95 annotation and edit some handling
matren395 Sep 27, 2023
c5bffa2
revise import, remove weird change, remove old fxn
matren395 Sep 27, 2023
120c25c
reformat gnomad.py with Black
matren395 Sep 27, 2023
6a35362
rerun isort==5.12.0
matren395 Sep 27, 2023
263346a
simpler FAF_POPS import
matren395 Sep 27, 2023
1d1e517
Update requirements.txt
matren395 Sep 28, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions gnomad/resources/grch38/gnomad.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
# noqa: D100

import logging
from typing import Optional

import hail as hl

from gnomad.resources.resource_utils import (
DataException,
GnomadPublicMatrixTableResource,
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
)
from gnomad.sample_qc.ancestry import POP_NAMES
from gnomad.utils.annotations import add_gks_va, add_gks_vrs
from gnomad.utils.vcf import FAF_POPS

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

CURRENT_EXOME_RELEASE = ""
CURRENT_GENOME_RELEASE = "3.1.2"
Expand Down Expand Up @@ -413,3 +426,163 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str:
contig = f".{contig}" if contig else ""
version_prefix = "r" if version.startswith("3.0") else "v"
return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz"


def gnomad_gks(
locus_interval: hl.IntervalExpression,
version: str,
data_type: str = "genomes",
by_ancestry_group: bool = False,
by_sex: bool = False,
vrs_only: bool = False,
custom_ht: hl.Table = None,
skip_checkpoint: bool = False,
skip_coverage: bool = False,
custom_coverage_ht: hl.Table = None,
) -> list:
"""
Perform gnomad GKS annotations on a range of variants at once.

:param locus_interval: Hail IntervalExpression of locus<reference_genome>.
e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38")
:param version: String of version of gnomAD release to use.
:param data_type: String of either "exomes" or "genomes" for the type of reads that are desired.
:param by_ancestry_group: Boolean to pass to obtain frequency information for each cohort.
:param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex.
:param vrs_only: Boolean to pass for only VRS info to be returned
(will not include allele frequency information).
:param custom_ht: Table to use instead of what public_release() method would return for the version.
:param skip_checkpoint: Bool to pass to skip checkpointing selected fields
(checkpointing may be desirable for large datasets by reducing data copies across the cluster).
:param skip_coverage: Bool to pass to skip adding coverage statistics.
:param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table.
:return: List of dictionaries containing VRS information
(and freq info split by ancestry groups and sex if desired) for specified variant.
"""
# Read public_release table if no custom table provided
if custom_ht:
ht = custom_ht
else:
ht = hl.read_table(public_release(data_type).versions[version].path)

high_level_version = f"v{version.split('.')[0]}"

# Read coverage statistics if requested
if high_level_version == "v3":
coverage_version = "3.0.1"
else:
raise NotImplementedError(
"gnomad_gks() is currently only implemented for gnomAD v3."
)

coverage_ht = None

if not skip_coverage:
if custom_coverage_ht:
coverage_ht = custom_coverage_ht
else:
coverage_ht = hl.read_table(
coverage("genomes").versions[coverage_version].path
)
ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean)

# Retrieve ancestry groups from the imported POPS dictionary.
pops_list = list(POPS[high_level_version]) if by_ancestry_group else None

# Throw warnings if contradictory arguments are passed.
if by_ancestry_group and vrs_only:
logger.warning(
"Both 'vrs_only' and 'by_ancestry_groups' have been specified. Ignoring"
" 'by_ancestry_groups' list and returning only VRS information."
)
elif by_sex and not by_ancestry_group:
logger.warning(
"Splitting whole database by sex is not yet supported. If using 'by_sex',"
" please also specify 'by_ancestry_group' to stratify by."
)

# Call and return add_gks_vrs and add_gks_va for chosen arguments.

# Select relevant fields, checkpoint, and filter to interval before adding
# annotations
ht = ht.annotate(
faf95=hl.rbind(
hl.sorted(
hl.array(
[
hl.struct(
faf=ht.faf[ht.faf_index_dict[f"{pop}-adj"]].faf95,
population=pop,
)
for pop in FAF_POPS
]
),
key=lambda f: (-f.faf, f.population),
),
lambda fafs: hl.if_else(
hl.len(fafs) > 0,
hl.struct(
popmax=fafs[0].faf,
popmax_population=hl.if_else(
fafs[0].faf == 0, hl.missing(hl.tstr), fafs[0].population
),
),
hl.struct(
popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr)
),
),
)
)

keep_fields = [ht.freq, ht.info.vrs, ht.faf95]

if not skip_coverage:
keep_fields.append(ht.mean_depth)

ht = ht.select(*keep_fields)

# Checkpoint narrower set of columns if not skipped.
ht = hl.filter_intervals(ht, [locus_interval])
if not skip_checkpoint:
ht = ht.checkpoint(hl.utils.new_temp_file("vrs_checkpoint", extension="ht"))

# Collect all variants as structs, so all dictionary construction can be
# done in native Python
variant_list = ht.collect()
ht_freq_index_dict = ht.freq_index_dict.collect()[0]

# Assemble output dicts with VRS and optionally frequency, append to list,
# then return list
outputs = []
for variant in variant_list:
vrs_variant = add_gks_vrs(variant.locus, variant.vrs)

out = {
"locus": {
"contig": variant.locus.contig,
"position": variant.locus.position,
"reference_genome": variant.locus.reference_genome.name,
},
"alleles": variant.alleles,
"gks_vrs_variant": vrs_variant,
}

if not vrs_only:
va_freq_dict = add_gks_va(
input_struct=variant,
label_name="gnomAD",
label_version=version,
ancestry_groups=pops_list,
ancestry_groups_dict=POP_NAMES,
by_sex=by_sex,
freq_index_dict=ht_freq_index_dict,
)

# Assign existing VRS information to "focusAllele" key
va_freq_dict["focusAllele"] = vrs_variant
out["gks_va_freq"] = va_freq_dict

# Append variant dictionary to list of outputs
outputs.append(out)

return outputs
Loading