Skip to content

Commit 541080e

Browse files
authored
Merge pull request #445 from broadinstitute/jg/sex_ploidy_variants
Modification to the `annotate_sex` pipeline to allow sex ploidy estimation using only variants instead of ref blocks
2 parents b3f960b + 122b36f commit 541080e

File tree

2 files changed

+143
-18
lines changed

2 files changed

+143
-18
lines changed

gnomad/sample_qc/pipeline.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ def annotate_sex(
226226
gt_expr: str = "GT",
227227
f_stat_cutoff: float = 0.5,
228228
aaf_threshold: float = 0.001,
229+
variants_only_x_ploidy: bool = False,
230+
variants_only_y_ploidy: bool = False,
229231
) -> hl.Table:
230232
"""
231233
Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.
@@ -256,6 +258,8 @@ def annotate_sex(
256258
:param gt_expr: Name of entry field storing the genotype. Default: 'GT'
257259
:param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
258260
:param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
261+
:param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
262+
:param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
259263
:return: Table of samples and their imputed sex karyotypes.
260264
"""
261265
logger.info("Imputing sex chromosome ploidies...")
@@ -266,27 +270,94 @@ def annotate_sex(
266270
raise NotImplementedError(
267271
"The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
268272
)
273+
# Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
269274
ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
270275
mtds,
271276
calling_intervals=included_intervals,
272277
normalization_contig=normalization_contig,
278+
use_variant_dataset=variants_only_x_ploidy,
273279
)
274280
ploidy_ht = ploidy_ht.rename(
275281
{
276282
"x_ploidy": "chrX_ploidy",
277283
"y_ploidy": "chrY_ploidy",
278284
"x_mean_dp": "chrX_mean_dp",
279285
"y_mean_dp": "chrY_mean_dp",
280-
"autosomal_mean_dp": f"{normalization_contig}_mean_dp",
286+
"autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
287+
if variants_only_x_ploidy
288+
else f"{normalization_contig}_mean_dp",
281289
}
282290
)
291+
# If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
292+
# the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
293+
if variants_only_y_ploidy != variants_only_x_ploidy:
294+
y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
295+
mtds,
296+
calling_intervals=included_intervals,
297+
normalization_contig=normalization_contig,
298+
use_variant_dataset=variants_only_y_ploidy,
299+
)
300+
y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
301+
ploidy_ht = ploidy_ht.annotate(
302+
chrY_ploidy=y_ploidy_idx.y_ploidy,
303+
chrY_mean_dp=y_ploidy_idx.y_mean_dp,
304+
)
305+
306+
# If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
307+
# that this is the variant dataset only mean DP (this will have already been added if
308+
# 'variants_only_x_ploidy' was also True).
309+
if variants_only_y_ploidy:
310+
ploidy_ht = ploidy_ht.annotate(
311+
**{
312+
f"var_data_{normalization_contig}_mean_dp": y_ploidy_idx.autosomal_mean_dp
313+
}
314+
)
315+
283316
mt = mtds.variant_data
284317
else:
285318
mt = mtds
286319
if is_sparse:
287320
ploidy_ht = impute_sex_ploidy(
288-
mt, excluded_intervals, included_intervals, normalization_contig
321+
mt,
322+
excluded_intervals,
323+
included_intervals,
324+
normalization_contig,
325+
use_only_variants=variants_only_x_ploidy,
289326
)
327+
ploidy_ht = ploidy_ht.rename(
328+
{
329+
"autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
330+
if variants_only_x_ploidy
331+
else f"{normalization_contig}_mean_dp",
332+
}
333+
)
334+
# If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
335+
# using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
336+
if variants_only_y_ploidy != variants_only_x_ploidy:
337+
y_ploidy_ht = impute_sex_ploidy(
338+
mt,
339+
excluded_intervals,
340+
included_intervals,
341+
normalization_contig,
342+
use_only_variants=variants_only_y_ploidy,
343+
)
344+
y_ploidy_ht.select(
345+
"chrY_ploidy",
346+
"chrY_mean_dp",
347+
f"{normalization_contig}_mean_dp",
348+
)
349+
# If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
350+
# that this is the variant dataset only mean DP (this will have already been added if
351+
# 'variants_only_x_ploidy' was also True).
352+
if variants_only_y_ploidy:
353+
ploidy_ht = ploidy_ht.rename(
354+
{
355+
f"{normalization_contig}_mean_dp": f"var_data_{normalization_contig}_mean_dp"
356+
}
357+
)
358+
# Re-annotate the ploidy HT with modified Y ploidy annotations
359+
ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])
360+
290361
else:
291362
raise NotImplementedError(
292363
"Imputing sex ploidy does not exist yet for dense data."
@@ -348,6 +419,8 @@ def annotate_sex(
348419
lower_cutoff_YY=y_ploidy_cutoffs[1],
349420
),
350421
f_stat_cutoff=f_stat_cutoff,
422+
variants_only_x_ploidy=variants_only_x_ploidy,
423+
variants_only_y_ploidy=variants_only_y_ploidy,
351424
)
352425
return sex_ht.annotate(
353426
**get_sex_expr(

gnomad/utils/sparse_mt.py

Lines changed: 68 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,7 @@ def impute_sex_ploidy(
549549
normalization_contig: str = "chr20",
550550
chr_x: Optional[str] = None,
551551
chr_y: Optional[str] = None,
552+
use_only_variants: bool = False,
552553
) -> hl.Table:
553554
"""
554555
Impute sex ploidy from a sparse MatrixTable.
@@ -557,16 +558,20 @@ def impute_sex_ploidy(
557558
chromosome (by default chr20).
558559
559560
Coverage is computed using the median block coverage (summed over the block size) and the non-ref coverage at
560-
non-ref genotypes.
561+
non-ref genotypes unless the `use_only_variants` argument is set to True and then it will use the mean coverage
562+
defined by only the variants.
561563
562564
:param mt: Input sparse Matrix Table
563-
:param excluded_calling_intervals: Optional table of intervals to exclude from the computation.
564-
Used only when determining contig size (not used when computing chromosome depth).
565-
:param included_calling_intervals: Optional table of intervals to use in the computation.
566-
Used only when determining contig size (not used when computing chromosome depth).
565+
:param excluded_calling_intervals: Optional table of intervals to exclude from the computation. Used only when
566+
determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
567+
:param included_calling_intervals: Optional table of intervals to use in the computation. Used only when
568+
determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
567569
:param normalization_contig: Which chromosome to normalize by
568570
:param chr_x: Optional X Chromosome contig name (by default uses the X contig in the reference)
569571
:param chr_y: Optional Y Chromosome contig name (by default uses the Y contig in the reference)
572+
:param use_only_variants: Whether to use depth of variant data within calling intervals instead of reference data.
573+
Default will only use reference data.
574+
570575
:return: Table with mean coverage over chromosomes 20, X and Y and sex chromosomes ploidy based on normalized coverage.
571576
"""
572577
ref = get_reference_genome(mt.locus, add_sequence=True)
@@ -588,6 +593,16 @@ def impute_sex_ploidy(
588593
chr_y = ref.y_contigs[0]
589594

590595
def get_contig_size(contig: str) -> int:
596+
"""
597+
Compute the size of the specified `contig` using the median block coverage (summed over the block size).
598+
599+
The size of the contig will be determined using only non par regions if the contig is an X or Y reference contig
600+
and using the intervals specified by `included_calling_intervals` and excluding intervals specified by
601+
`excluded_calling_intervals` if either is defined in the outer function.
602+
603+
:param contig: Contig to compute the size of
604+
:return: Integer of the contig size
605+
"""
591606
logger.info("Working on %s", contig)
592607
contig_ht = hl.utils.range_table(
593608
ref.contig_length(contig),
@@ -617,6 +632,24 @@ def get_contig_size(contig: str) -> int:
617632
return contig_size
618633

619634
def get_chr_dp_ann(chrom: str) -> hl.Table:
635+
"""
636+
Compute the mean depth of the specified chromosome.
637+
638+
The total depth will be determined using the sum DP of either reference and variant data or only variant data
639+
depending on the value of `use_only_variants` in the outer function.
640+
641+
If `use_only_variants` is set to False then this value is computed using the median block coverage (summed over
642+
the block size). If `use_only_variants` is set to True, this value is computed using the sum of DP for all
643+
variants divided by the total number of variants.
644+
645+
The depth calculations will be determined using only non par regions if the contig is an X or Y reference contig
646+
and using the intervals specified by `included_calling_intervals` and excluding intervals specified by
647+
`excluded_calling_intervals` if either is defined in the outer function (when `use_only_variants` is not
648+
set this only applies to the contig size estimate and is not used when computing chromosome depth).
649+
650+
:param chrom: Chromosome to compute the mean depth of
651+
:return: Table of a per sample mean depth of `chrom`
652+
"""
620653
contig_size = get_contig_size(chrom)
621654
chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])
622655

@@ -625,18 +658,37 @@ def get_chr_dp_ann(chrom: str) -> hl.Table:
625658
if chrom in ref.y_contigs:
626659
chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())
627660

628-
return chr_mt.select_cols(
629-
**{
630-
f"{chrom}_mean_dp": hl.agg.sum(
631-
hl.cond(
632-
chr_mt.LGT.is_hom_ref(),
633-
chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
634-
chr_mt.DP,
635-
)
661+
if use_only_variants:
662+
if included_calling_intervals is not None:
663+
chr_mt = chr_mt.filter_rows(
664+
hl.is_defined(included_calling_intervals[chr_mt.row_key])
665+
)
666+
if excluded_calling_intervals is not None:
667+
chr_mt = chr_mt.filter_rows(
668+
hl.is_missing(excluded_calling_intervals[chr_mt.row_key])
636669
)
637-
/ contig_size
638-
}
639-
).cols()
670+
return chr_mt.select_cols(
671+
**{
672+
f"{chrom}_mean_dp": hl.agg.filter(
673+
chr_mt.LGT.is_non_ref(),
674+
hl.agg.sum(chr_mt.DP),
675+
)
676+
/ hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count())
677+
}
678+
).cols()
679+
else:
680+
return chr_mt.select_cols(
681+
**{
682+
f"{chrom}_mean_dp": hl.agg.sum(
683+
hl.if_else(
684+
chr_mt.LGT.is_hom_ref(),
685+
chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
686+
chr_mt.DP,
687+
)
688+
)
689+
/ contig_size
690+
}
691+
).cols()
640692

641693
normalization_chrom_dp = get_chr_dp_ann(normalization_contig)
642694
chrX_dp = get_chr_dp_ann(chr_x)

0 commit comments

Comments
 (0)