Skip to content

Commit 060fcb8

Browse files
authored
Merge pull request #617 from theferrit32/kf/gks-cg-annotations
GKS formatting changes and addition of gnomAD flags to annotation
2 parents e14aa94 + caabf24 commit 060fcb8

File tree

3 files changed

+177
-157
lines changed

3 files changed

+177
-157
lines changed

gnomad/resources/grch38/gnomad.py

Lines changed: 78 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
)
1515
from gnomad.sample_qc.ancestry import POP_NAMES
1616
from gnomad.utils.annotations import add_gks_va, add_gks_vrs
17-
from gnomad.utils.vcf import FAF_POPS
1817

1918
logging.basicConfig(
2019
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
@@ -479,6 +478,32 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str:
479478
return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz"
480479

481480

481+
def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
482+
"""
483+
Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'.
484+
485+
Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.
486+
487+
:param ht: Input hail table.
488+
:return: Annotated hail table.
489+
"""
490+
if "gnomad" in ht.fafmax:
491+
fafmax_field = ht.fafmax.gnomad
492+
else:
493+
fafmax_field = ht.fafmax
494+
ht = ht.annotate(
495+
grpMaxFAF95=hl.struct(
496+
popmax=fafmax_field.faf95_max,
497+
popmax_population=fafmax_field.faf95_max_gen_anc,
498+
),
499+
jointGrpMaxFAF95=hl.struct(
500+
popmax=ht.joint_fafmax.faf95_max,
501+
popmax_population=ht.joint_fafmax.faf95_max_gen_anc,
502+
),
503+
)
504+
return ht
505+
506+
482507
def gnomad_gks(
483508
locus_interval: hl.IntervalExpression,
484509
version: str,
@@ -510,21 +535,29 @@ def gnomad_gks(
510535
:return: List of dictionaries containing VRS information
511536
(and freq info split by ancestry groups and sex if desired) for specified variant.
512537
"""
513-
# Read public_release table if no custom table provided
538+
# Obtain the high level version number and verify that it is 4.
539+
high_level_version = f"v{version.split('.')[0]}"
540+
if high_level_version != "v4":
541+
raise NotImplementedError(
542+
"gnomad_gks() is currently only implemented for gnomAD v4."
543+
)
544+
545+
# Read public_release table if no custom table provided.
514546
if custom_ht:
515547
ht = custom_ht
516548
else:
517549
ht = hl.read_table(public_release(data_type).versions[version].path)
518550

519-
high_level_version = f"v{version.split('.')[0]}"
551+
# Read coverage statistics if requested.
552+
coverage_version_3_0_1 = "3.0.1" # v4 genomes coverage
553+
coverage_version_4_0 = "4.0" # v4 exomes coverage
520554

521-
# Read coverage statistics if requested
522-
if high_level_version == "v3":
523-
coverage_version = "3.0.1"
555+
# In v4, exomes have coverage in v4 coverage table,
556+
# genomes have coverage in v3 coverage table.
557+
if data_type == "genomes":
558+
coverage_version = coverage_version_3_0_1
524559
else:
525-
raise NotImplementedError(
526-
"gnomad_gks() is currently only implemented for gnomAD v3."
527-
)
560+
coverage_version = coverage_version_4_0
528561

529562
coverage_ht = None
530563

@@ -533,12 +566,17 @@ def gnomad_gks(
533566
coverage_ht = custom_coverage_ht
534567
else:
535568
coverage_ht = hl.read_table(
536-
coverage("genomes").versions[coverage_version].path
569+
coverage(data_type).versions[coverage_version].path
537570
)
538571
ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean)
572+
ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20)
539573

540574
# Retrieve ancestry groups from the imported POPS dictionary.
541-
pops_list = list(POPS[high_level_version]) if by_ancestry_group else None
575+
pops_list = (
576+
list(POPS["v4" if data_type == "exomes" else "v3"])
577+
if by_ancestry_group
578+
else None
579+
)
542580

543581
# Throw warnings if contradictory arguments are passed.
544582
if by_ancestry_group and vrs_only:
@@ -552,43 +590,36 @@ def gnomad_gks(
552590
" please also specify 'by_ancestry_group' to stratify by."
553591
)
554592

555-
# Call and return add_gks_vrs and add_gks_va for chosen arguments.
556-
557593
# Select relevant fields, checkpoint, and filter to interval before adding
558-
# annotations
559-
ht = ht.annotate(
560-
faf95=hl.rbind(
561-
hl.sorted(
562-
hl.array(
563-
[
564-
hl.struct(
565-
faf=ht.faf[ht.faf_index_dict[f"{pop}-adj"]].faf95,
566-
population=pop,
567-
)
568-
for pop in FAF_POPS
569-
]
570-
),
571-
key=lambda f: (-f.faf, f.population),
572-
),
573-
lambda fafs: hl.if_else(
574-
hl.len(fafs) > 0,
575-
hl.struct(
576-
popmax=fafs[0].faf,
577-
popmax_population=hl.if_else(
578-
fafs[0].faf == 0, hl.missing(hl.tstr), fafs[0].population
579-
),
580-
),
581-
hl.struct(
582-
popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr)
583-
),
584-
),
585-
)
586-
)
594+
# annotations.
595+
596+
# Pull up LCR flag and make referrable in the same field.
597+
ht = ht.annotate(lcr=ht.region_flags.lcr)
598+
599+
# Pull up allele balance histogram arrays.
600+
ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt)
587601

588-
keep_fields = [ht.freq, ht.info.vrs, ht.faf95]
602+
ht = add_grpMaxFAF95_v4(ht)
603+
604+
ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par())
605+
606+
keep_fields = [
607+
ht.freq,
608+
ht.info.vrs,
609+
ht.info.monoallelic,
610+
ht.grpMaxFAF95,
611+
ht.filters,
612+
ht.lcr,
613+
ht.ab_hist_alt,
614+
ht.in_autosome_or_par,
615+
]
589616

590617
if not skip_coverage:
591618
keep_fields.append(ht.mean_depth)
619+
keep_fields.append(ht.fraction_cov_over_20)
620+
621+
if "jointGrpMaxFAF95" in ht.row:
622+
keep_fields.append(ht.jointGrpMaxFAF95)
592623

593624
ht = ht.select(*keep_fields)
594625

@@ -598,9 +629,12 @@ def gnomad_gks(
598629
ht = ht.checkpoint(hl.utils.new_temp_file("vrs_checkpoint", extension="ht"))
599630

600631
# Collect all variants as structs, so all dictionary construction can be
601-
# done in native Python
632+
# done in native Python.
602633
variant_list = ht.collect()
603634
ht_freq_index_dict = ht.freq_index_dict.collect()[0]
635+
# gnomad v4 renamed freq_index_dict keys to use underscores instead of dashes.
636+
# Use underscores for v3 as well.
637+
ht_freq_index_dict = {k.replace("-", "_"): v for k, v in ht_freq_index_dict.items()}
604638

605639
# Assemble output dicts with VRS and optionally frequency, append to list,
606640
# then return list

gnomad/sample_qc/ancestry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
"oeu": "#6AA5CD",
8383
"onf": "#6AA5CD",
8484
"unk": "#ABB9B9",
85+
"remaining": "#ABB9B9",
8586
"": "#ABB9B9",
8687
}
8788

0 commit comments

Comments
 (0)