diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fcddb085..56993bfa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ * VersionedResource objects are no longer subclasses of BaseResource [(#359)](https://github.com/broadinstitute/gnomad_methods/pull/359) * gnomAD resources can now be imported from different sources [(#373)](https://github.com/broadinstitute/gnomad_methods/pull/373) * Replaced `ht_to_vcf_mt` with `adjust_vcf_incompatible_types` which maintains all functionality except turning the ht into a mt because it is no longer needed for use of the Hail module `export_vcf` [(#365)](https://github.com/broadinstitute/gnomad_methods/pull/365/files) - +* Added function `remove_fields_from_constant` to remove fields from a list and notify which requested fields to remove were missing [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381) +* Added function `create_label_groups` to generate a list of label group dictionaries needed to populate the info dictionary for vcf export [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381) +* Added function `build_vcf_export_reference` to create a subset reference based on an existing reference genome [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381) +* Added function `rekey_new_reference` to re-key a Table or MatrixTable with a new reference genome [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381) +* Modified `SEXES` in utils/vcf to be 'XX' and 'XY' instead of 'female' and 'male' [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381) ## Version 0.5.0 - April 22nd, 2021 diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index cceccccef..a738efb35 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -341,3 +341,21 @@ def filter_to_clinvar_pathogenic( t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(), ) return t + + +def remove_fields_from_constant( + constant: List[str], fields_to_remove: List[str] +) -> List[str]: + """ + Remove fields from a list and display any field(s) missing from the original list. + + :param constant: List of fields + :param fields_to_remove: List of fields to remove from `constant` + """ + for field in fields_to_remove: + if field in constant: + constant.remove(field) + else: + logger.info("%s missing from %s", field, constant) + + return constant diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 6c7e13472..e43ae9a67 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -42,10 +42,12 @@ Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export. """ -SEXES = ["male", "female"] +SEXES = ["XX", "XY"] """ Sample sexes used in VCF export. + Used to stratify frequency annotations (AC, AN, AF) for each sex. +Note that sample sexes in gnomAD v3 and earlier were 'male' and 'female'. """ AS_FIELDS = [ @@ -493,6 +495,33 @@ def make_combo_header_text( return header_text +def create_label_groups( + pops: List[str], + sexes: List[str] = SEXES, + all_groups: List[str] = GROUPS, + pop_sex_groups: List[str] = ["adj"], +) -> List[Dict[str, List[str]]]: + """ + Generate a list of label group dictionaries needed to populate info dictionary. + + Label dictionaries are passed as input to `make_info_dict`. + + :param pops: List of population names. + :param sexes: List of sample sexes. + :param all_groups: List of data types (raw, adj). Default is `GROUPS`, which is ["raw", "adj"]. + :param pop_sex_groups: List of data types (raw, adj) to populate with pops and sexes. Default is ["adj"]. + :return: List of label group dictionaries. + """ + return [ + # This is to capture raw frequency fields, which are + # not stratified by sex or population (e.g., only AC_raw exists, not AC_XX_raw) + dict(group=all_groups), + dict(group=pop_sex_groups, sex=sexes), + dict(group=pop_sex_groups, pop=pops), + dict(group=pop_sex_groups, pop=pops, sex=sexes), + ] + + def make_info_dict( prefix: str = "", prefix_before_metric: bool = True, @@ -865,3 +894,64 @@ def set_female_y_metrics_to_na( } ) return female_metrics_dict + + +def build_vcf_export_reference( + name: str, + build: str = "GRCh38", + keep_contigs: List[str] = [f"chr{i}" for i in range(1, 23)] + + ["chrX", "chrY", "chrM"], +) -> hl.ReferenceGenome: + """ + Create export reference based on reference genome defined by `build`. + + By default this will return a new reference with all non-standard contigs eliminated. Keeps chr 1-22, Y, X, and M. + + An example of a non-standard contig is: ##contig= + + :param name: Name to use for new reference. + :param build: Reference genome build to use as starting reference genome. + :param keep_contigs: Contigs to keep from reference genome defined by `build`. Default is autosomes, sex chromosomes, and chrM. + :return: Reference genome for VCF export containing only contigs in `keep_contigs`. + """ + ref = hl.get_reference(build) + + export_reference = hl.ReferenceGenome( + name=name, + contigs=keep_contigs, + lengths={contig: ref.lengths[contig] for contig in keep_contigs}, + x_contigs=ref.x_contigs, + y_contigs=ref.y_contigs, + par=[ + (interval.start.contig, interval.start.position, interval.end.position) + for interval in ref.par + ], + mt_contigs=ref.mt_contigs, + ) + + return export_reference + + +def rekey_new_reference( + t: Union[hl.Table, hl.MatrixTable], reference: hl.ReferenceGenome +) -> Union[hl.Table, hl.MatrixTable]: + """ + Re-key Table or MatrixTable with a new reference genome. + + :param t: Input Table/MatrixTable. + :param reference: Reference genome to re-key with. + :return: Re-keyed Table/MatrixTable + """ + t = t.rename({"locus": "locus_original"}) + locus_expr = hl.locus( + t.locus_original.contig, t.locus_original.position, reference_genome=reference, + ) + + if isinstance(t, hl.MatrixTable): + t = t.annotate_rows(locus=locus_expr) + t = t.key_rows_by("locus", "alleles").drop("locus_original") + else: + t = t.annotate(locus=locus_expr) + t = t.key_by("locus", "alleles").drop("locus_original") + + return t