1414)
1515from gnomad .sample_qc .ancestry import POP_NAMES
1616from gnomad .utils .annotations import add_gks_va , add_gks_vrs
17- from gnomad .utils .vcf import FAF_POPS
1817
1918logging .basicConfig (
2019 format = "%(asctime)s (%(name)s %(lineno)s): %(message)s" ,
@@ -479,6 +478,32 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str:
479478 return f"gs://gcp-public-data--gnomad/release/{ version } /vcf/{ data_type } /gnomad.{ data_type } .{ version_prefix } { version } .sites{ contig } .vcf.bgz"
480479
481480
481+ def add_grpMaxFAF95_v4 (ht : hl .Table ) -> hl .Table :
482+ """
483+ Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'.
484+
485+ Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.
486+
487+ :param ht: Input hail table.
488+ :return: Annotated hail table.
489+ """
490+ if "gnomad" in ht .fafmax :
491+ fafmax_field = ht .fafmax .gnomad
492+ else :
493+ fafmax_field = ht .fafmax
494+ ht = ht .annotate (
495+ grpMaxFAF95 = hl .struct (
496+ popmax = fafmax_field .faf95_max ,
497+ popmax_population = fafmax_field .faf95_max_gen_anc ,
498+ ),
499+ jointGrpMaxFAF95 = hl .struct (
500+ popmax = ht .joint_fafmax .faf95_max ,
501+ popmax_population = ht .joint_fafmax .faf95_max_gen_anc ,
502+ ),
503+ )
504+ return ht
505+
506+
482507def gnomad_gks (
483508 locus_interval : hl .IntervalExpression ,
484509 version : str ,
@@ -510,21 +535,29 @@ def gnomad_gks(
510535 :return: List of dictionaries containing VRS information
511536 (and freq info split by ancestry groups and sex if desired) for specified variant.
512537 """
513- # Read public_release table if no custom table provided
538+ # Obtain the high level version number and verify that it is 4.
539+ high_level_version = f"v{ version .split ('.' )[0 ]} "
540+ if high_level_version != "v4" :
541+ raise NotImplementedError (
542+ "gnomad_gks() is currently only implemented for gnomAD v4."
543+ )
544+
545+ # Read public_release table if no custom table provided.
514546 if custom_ht :
515547 ht = custom_ht
516548 else :
517549 ht = hl .read_table (public_release (data_type ).versions [version ].path )
518550
519- high_level_version = f"v{ version .split ('.' )[0 ]} "
551+ # Read coverage statistics if requested.
552+ coverage_version_3_0_1 = "3.0.1" # v4 genomes coverage
553+ coverage_version_4_0 = "4.0" # v4 exomes coverage
520554
521- # Read coverage statistics if requested
522- if high_level_version == "v3" :
523- coverage_version = "3.0.1"
555+ # In v4, exomes have coverage in v4 coverage table,
556+ # genomes have coverage in v3 coverage table.
557+ if data_type == "genomes" :
558+ coverage_version = coverage_version_3_0_1
524559 else :
525- raise NotImplementedError (
526- "gnomad_gks() is currently only implemented for gnomAD v3."
527- )
560+ coverage_version = coverage_version_4_0
528561
529562 coverage_ht = None
530563
@@ -533,12 +566,17 @@ def gnomad_gks(
533566 coverage_ht = custom_coverage_ht
534567 else :
535568 coverage_ht = hl .read_table (
536- coverage ("genomes" ).versions [coverage_version ].path
569+ coverage (data_type ).versions [coverage_version ].path
537570 )
538571 ht = ht .annotate (mean_depth = coverage_ht [ht .locus ].mean )
572+ ht = ht .annotate (fraction_cov_over_20 = coverage_ht [ht .locus ].over_20 )
539573
540574 # Retrieve ancestry groups from the imported POPS dictionary.
541- pops_list = list (POPS [high_level_version ]) if by_ancestry_group else None
575+ pops_list = (
576+ list (POPS ["v4" if data_type == "exomes" else "v3" ])
577+ if by_ancestry_group
578+ else None
579+ )
542580
543581 # Throw warnings if contradictory arguments are passed.
544582 if by_ancestry_group and vrs_only :
@@ -552,43 +590,36 @@ def gnomad_gks(
552590 " please also specify 'by_ancestry_group' to stratify by."
553591 )
554592
555- # Call and return add_gks_vrs and add_gks_va for chosen arguments.
556-
557593 # Select relevant fields, checkpoint, and filter to interval before adding
558- # annotations
559- ht = ht .annotate (
560- faf95 = hl .rbind (
561- hl .sorted (
562- hl .array (
563- [
564- hl .struct (
565- faf = ht .faf [ht .faf_index_dict [f"{ pop } -adj" ]].faf95 ,
566- population = pop ,
567- )
568- for pop in FAF_POPS
569- ]
570- ),
571- key = lambda f : (- f .faf , f .population ),
572- ),
573- lambda fafs : hl .if_else (
574- hl .len (fafs ) > 0 ,
575- hl .struct (
576- popmax = fafs [0 ].faf ,
577- popmax_population = hl .if_else (
578- fafs [0 ].faf == 0 , hl .missing (hl .tstr ), fafs [0 ].population
579- ),
580- ),
581- hl .struct (
582- popmax = hl .missing (hl .tfloat ), popmax_population = hl .missing (hl .tstr )
583- ),
584- ),
585- )
586- )
594+ # annotations.
595+
596+ # Pull up LCR flag and make referrable in the same field.
597+ ht = ht .annotate (lcr = ht .region_flags .lcr )
598+
599+ # Pull up allele balance histogram arrays.
600+ ht = ht .annotate (ab_hist_alt = ht .histograms .qual_hists .ab_hist_alt )
587601
588- keep_fields = [ht .freq , ht .info .vrs , ht .faf95 ]
602+ ht = add_grpMaxFAF95_v4 (ht )
603+
604+ ht = ht .annotate (in_autosome_or_par = ht .locus .in_autosome_or_par ())
605+
606+ keep_fields = [
607+ ht .freq ,
608+ ht .info .vrs ,
609+ ht .info .monoallelic ,
610+ ht .grpMaxFAF95 ,
611+ ht .filters ,
612+ ht .lcr ,
613+ ht .ab_hist_alt ,
614+ ht .in_autosome_or_par ,
615+ ]
589616
590617 if not skip_coverage :
591618 keep_fields .append (ht .mean_depth )
619+ keep_fields .append (ht .fraction_cov_over_20 )
620+
621+ if "jointGrpMaxFAF95" in ht .row :
622+ keep_fields .append (ht .jointGrpMaxFAF95 )
592623
593624 ht = ht .select (* keep_fields )
594625
@@ -598,9 +629,12 @@ def gnomad_gks(
598629 ht = ht .checkpoint (hl .utils .new_temp_file ("vrs_checkpoint" , extension = "ht" ))
599630
600631 # Collect all variants as structs, so all dictionary construction can be
601- # done in native Python
632+ # done in native Python.
602633 variant_list = ht .collect ()
603634 ht_freq_index_dict = ht .freq_index_dict .collect ()[0 ]
635+ # gnomad v4 renamed freq_index_dict keys to use underscores instead of dashes.
636+ # Use underscores for v3 as well.
637+ ht_freq_index_dict = {k .replace ("-" , "_" ): v for k , v in ht_freq_index_dict .items ()}
604638
605639 # Assemble output dicts with VRS and optionally frequency, append to list,
606640 # then return list
0 commit comments