Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
* VersionedResource objects are no longer subclasses of BaseResource [(#359)](https://github.com/broadinstitute/gnomad_methods/pull/359)
* gnomAD resources can now be imported from different sources [(#373)](https://github.com/broadinstitute/gnomad_methods/pull/373)
* Replaced `ht_to_vcf_mt` with `adjust_vcf_incompatible_types` which maintains all functionality except turning the ht into a mt because it is no longer needed for use of the Hail module `export_vcf` [(#365)](https://github.com/broadinstitute/gnomad_methods/pull/365/files)

* Added function `remove_fields_from_constant` to remove fields from a list and notify which requested fields to remove were missing [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381)
* Added function `create_label_groups` to generate a list of label group dictionaries needed to populate the info dictionary for vcf export [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381)
* Added function `build_vcf_export_reference` to create a subset reference based on an existing reference genome [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381)
* Added function `rekey_new_reference` to re-key a Table or MatrixTable with a new reference genome [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381)
* Modified `SEXES` in utils/vcf to be 'XX' and 'XY' instead of 'female' and 'male' [(#381)](https://github.com/broadinstitute/gnomad_methods/pull/381)

## Version 0.5.0 - April 22nd, 2021

Expand Down
18 changes: 18 additions & 0 deletions gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,21 @@ def filter_to_clinvar_pathogenic(
t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(),
)
return t


def remove_fields_from_constant(
constant: List[str], fields_to_remove: List[str]
) -> List[str]:
"""
Remove fields from a list and display any field(s) missing from the original list.

:param constant: List of fields
:param fields_to_remove: List of fields to remove from `constant`
"""
for field in fields_to_remove:
if field in constant:
constant.remove(field)
else:
logger.info("%s missing from %s", field, constant)

return constant
92 changes: 91 additions & 1 deletion gnomad/utils/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@
Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.
"""

SEXES = ["male", "female"]
SEXES = ["XX", "XY"]
"""
Sample sexes used in VCF export.

Used to stratify frequency annotations (AC, AN, AF) for each sex.
Note that sample sexes in gnomAD v3 and earlier were 'male' and 'female'.
"""

AS_FIELDS = [
Expand Down Expand Up @@ -493,6 +495,33 @@ def make_combo_header_text(
return header_text


def create_label_groups(
pops: List[str],
sexes: List[str] = SEXES,
all_groups: List[str] = GROUPS,
pop_sex_groups: List[str] = ["adj"],
) -> List[Dict[str, List[str]]]:
"""
Generate a list of label group dictionaries needed to populate info dictionary.

Label dictionaries are passed as input to `make_info_dict`.

:param pops: List of population names.
:param sexes: List of sample sexes.
:param all_groups: List of data types (raw, adj). Default is `GROUPS`, which is ["raw", "adj"].
:param pop_sex_groups: List of data types (raw, adj) to populate with pops and sexes. Default is ["adj"].
:return: List of label group dictionaries.
"""
return [
# This is to capture raw frequency fields, which are
# not stratified by sex or population (e.g., only AC_raw exists, not AC_XX_raw)
dict(group=all_groups),
dict(group=pop_sex_groups, sex=sexes),
dict(group=pop_sex_groups, pop=pops),
dict(group=pop_sex_groups, pop=pops, sex=sexes),
]


def make_info_dict(
prefix: str = "",
prefix_before_metric: bool = True,
Expand Down Expand Up @@ -865,3 +894,64 @@ def set_female_y_metrics_to_na(
}
)
return female_metrics_dict


def build_vcf_export_reference(
name: str,
build: str = "GRCh38",
keep_contigs: List[str] = [f"chr{i}" for i in range(1, 23)]
+ ["chrX", "chrY", "chrM"],
) -> hl.ReferenceGenome:
"""
Create export reference based on reference genome defined by `build`.

By default this will return a new reference with all non-standard contigs eliminated. Keeps chr 1-22, Y, X, and M.

An example of a non-standard contig is: ##contig=<ID=chr3_GL000221v1_random,length=155397,assembly=GRCh38>

:param name: Name to use for new reference.
:param build: Reference genome build to use as starting reference genome.
:param keep_contigs: Contigs to keep from reference genome defined by `build`. Default is autosomes, sex chromosomes, and chrM.
:return: Reference genome for VCF export containing only contigs in `keep_contigs`.
"""
ref = hl.get_reference(build)

export_reference = hl.ReferenceGenome(
name=name,
contigs=keep_contigs,
lengths={contig: ref.lengths[contig] for contig in keep_contigs},
x_contigs=ref.x_contigs,
y_contigs=ref.y_contigs,
par=[
(interval.start.contig, interval.start.position, interval.end.position)
for interval in ref.par
],
mt_contigs=ref.mt_contigs,
)

return export_reference


def rekey_new_reference(
t: Union[hl.Table, hl.MatrixTable], reference: hl.ReferenceGenome
) -> Union[hl.Table, hl.MatrixTable]:
"""
Re-key Table or MatrixTable with a new reference genome.

:param t: Input Table/MatrixTable.
:param reference: Reference genome to re-key with.
:return: Re-keyed Table/MatrixTable
"""
t = t.rename({"locus": "locus_original"})
locus_expr = hl.locus(
t.locus_original.contig, t.locus_original.position, reference_genome=reference,
)

if isinstance(t, hl.MatrixTable):
t = t.annotate_rows(locus=locus_expr)
t = t.key_rows_by("locus", "alleles").drop("locus_original")
else:
t = t.annotate(locus=locus_expr)
t = t.key_by("locus", "alleles").drop("locus_original")

return t