Merge pull request #445 from broadinstitute/jg/sex_ploidy_variants

jkgoodrich · web-flow · commit 541080e0f622 · 2022-04-28T16:45:51.000-04:00
Modification to the `annotate_sex` pipeline to allow sex ploidy estimation using only variants instead of ref blocks
diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py
@@ -226,6 +226,8 @@ def annotate_sex(
     gt_expr: str = "GT",
     f_stat_cutoff: float = 0.5,
     aaf_threshold: float = 0.001,
+    variants_only_x_ploidy: bool = False,
+    variants_only_y_ploidy: bool = False,
 ) -> hl.Table:
     """
     Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.
@@ -256,6 +258,8 @@ def annotate_sex(
     :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
     :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
     :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
+    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
+    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
     :return: Table of samples and their imputed sex karyotypes.
     """
     logger.info("Imputing sex chromosome ploidies...")
@@ -266,27 +270,94 @@ def annotate_sex(
             raise NotImplementedError(
                 "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
             )
+        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
         ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
             mtds,
             calling_intervals=included_intervals,
             normalization_contig=normalization_contig,
+            use_variant_dataset=variants_only_x_ploidy,
         )
         ploidy_ht = ploidy_ht.rename(
             {
                 "x_ploidy": "chrX_ploidy",
                 "y_ploidy": "chrY_ploidy",
                 "x_mean_dp": "chrX_mean_dp",
                 "y_mean_dp": "chrY_mean_dp",
-                "autosomal_mean_dp": f"{normalization_contig}_mean_dp",
+                "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
+                if variants_only_x_ploidy
+                else f"{normalization_contig}_mean_dp",
             }
         )
+        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
+        # the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
+        if variants_only_y_ploidy != variants_only_x_ploidy:
+            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
+                mtds,
+                calling_intervals=included_intervals,
+                normalization_contig=normalization_contig,
+                use_variant_dataset=variants_only_y_ploidy,
+            )
+            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
+            ploidy_ht = ploidy_ht.annotate(
+                chrY_ploidy=y_ploidy_idx.y_ploidy,
+                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
+            )
+
+            # If 'variants_only_y_ploidy' is True, modify the name of the normalization contig mean DP to indicate
+            # that this is the variant dataset only mean DP (this will have already been added if
+            # 'variants_only_x_ploidy' was also True).
+            if variants_only_y_ploidy:
+                ploidy_ht = ploidy_ht.annotate(
+                    **{
+                        f"var_data_{normalization_contig}_mean_dp": y_ploidy_idx.autosomal_mean_dp
+                    }
+                )
+
         mt = mtds.variant_data
     else:
         mt = mtds
         if is_sparse:
             ploidy_ht = impute_sex_ploidy(
-                mt, excluded_intervals, included_intervals, normalization_contig
+                mt,
+                excluded_intervals,
+                included_intervals,
+                normalization_contig,
+                use_only_variants=variants_only_x_ploidy,
             )
+            ploidy_ht = ploidy_ht.rename(
+                {
+                    "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
+                    if variants_only_x_ploidy
+                    else f"{normalization_contig}_mean_dp",
+                }
+            )
+            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
+            # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
+            if variants_only_y_ploidy != variants_only_x_ploidy:
+                y_ploidy_ht = impute_sex_ploidy(
+                    mt,
+                    excluded_intervals,
+                    included_intervals,
+                    normalization_contig,
+                    use_only_variants=variants_only_y_ploidy,
+                )
+                y_ploidy_ht.select(
+                    "chrY_ploidy",
+                    "chrY_mean_dp",
+                    f"{normalization_contig}_mean_dp",
+                )
+                # If 'variants_only_y_ploidy' is True, modify the name of the normalization contig mean DP to indicate
+                # that this is the variant dataset only mean DP (this will have already been added if
+                # 'variants_only_x_ploidy' was also True).
+                if variants_only_y_ploidy:
+                    ploidy_ht = ploidy_ht.rename(
+                        {
+                            f"{normalization_contig}_mean_dp": f"var_data_{normalization_contig}_mean_dp"
+                        }
+                    )
+                # Re-annotate the ploidy HT with modified Y ploidy annotations
+                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])
+
         else:
             raise NotImplementedError(
                 "Imputing sex ploidy does not exist yet for dense data."
@@ -348,6 +419,8 @@ def annotate_sex(
             lower_cutoff_YY=y_ploidy_cutoffs[1],
         ),
         f_stat_cutoff=f_stat_cutoff,
+        variants_only_x_ploidy=variants_only_x_ploidy,
+        variants_only_y_ploidy=variants_only_y_ploidy,
     )
     return sex_ht.annotate(
         **get_sex_expr(
diff --git a/gnomad/utils/sparse_mt.py b/gnomad/utils/sparse_mt.py
@@ -549,6 +549,7 @@ def impute_sex_ploidy(
     normalization_contig: str = "chr20",
     chr_x: Optional[str] = None,
     chr_y: Optional[str] = None,
+    use_only_variants: bool = False,
 ) -> hl.Table:
     """
     Impute sex ploidy from a sparse MatrixTable.
@@ -557,16 +558,20 @@ def impute_sex_ploidy(
     chromosome (by default chr20).
 
     Coverage is computed using the median block coverage (summed over the block size) and the non-ref coverage at
-    non-ref genotypes.
+    non-ref genotypes, unless the `use_only_variants` argument is set to True, in which case the mean coverage is
+    computed from only the variants.
 
     :param mt: Input sparse Matrix Table
-    :param excluded_calling_intervals: Optional table of intervals to exclude from the computation.
-        Used only when determining contig size (not used when computing chromosome depth).
-    :param included_calling_intervals: Optional table of intervals to use in the computation.
-        Used only when determining contig size (not used when computing chromosome depth).
+    :param excluded_calling_intervals: Optional table of intervals to exclude from the computation. Used only when
+        determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
+    :param included_calling_intervals: Optional table of intervals to use in the computation. Used only when
+        determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
     :param normalization_contig: Which chromosome to normalize by
     :param chr_x: Optional X Chromosome contig name (by default uses the X contig in the reference)
     :param chr_y: Optional Y Chromosome contig name (by default uses the Y contig in the reference)
+    :param use_only_variants: Whether to use depth of variant data within calling intervals instead of reference data.
+        Default is False, in which case depth from both reference blocks and non-ref genotypes is used.
+
     :return: Table with mean coverage over chromosomes 20, X and Y and sex chromosomes ploidy based on normalized coverage.
     """
     ref = get_reference_genome(mt.locus, add_sequence=True)
@@ -588,6 +593,16 @@ def impute_sex_ploidy(
         chr_y = ref.y_contigs[0]
 
     def get_contig_size(contig: str) -> int:
+        """
+        Compute the size of the specified `contig` using the median block coverage (summed over the block size).
+
+        The size of the contig will be determined using only non par regions if the contig is an X or Y reference contig
+        and using the intervals specified by `included_calling_intervals` and excluding intervals specified by
+        `excluded_calling_intervals` if either is defined in the outer function.
+
+        :param contig: Contig to compute the size of
+        :return: Integer of the contig size
+        """
         logger.info("Working on %s", contig)
         contig_ht = hl.utils.range_table(
             ref.contig_length(contig),
@@ -617,6 +632,24 @@ def get_contig_size(contig: str) -> int:
         return contig_size
 
     def get_chr_dp_ann(chrom: str) -> hl.Table:
+        """
+        Compute the mean depth of the specified chromosome.
+
+        The total depth will be determined using the sum DP of either reference and variant data or only variant data
+        depending on the value of `use_only_variants` in the outer function.
+
+        If `use_only_variants` is set to False then this value is computed using the median block coverage (summed over
+        the block size). If `use_only_variants` is set to True, this value is computed using the sum of DP for all
+        variants divided by the total number of variants.
+
+        The depth calculations will be determined using only non par regions if the contig is an X or Y reference contig
+        and using the intervals specified by `included_calling_intervals` and excluding intervals specified by
+        `excluded_calling_intervals` if either is defined in the outer function (when `use_only_variants` is not
+        set this only applies to the contig size estimate and is not used when computing chromosome depth).
+
+        :param chrom: Chromosome to compute the mean depth of
+        :return: Table of a per sample mean depth of `chrom`
+        """
         contig_size = get_contig_size(chrom)
         chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])
 
@@ -625,18 +658,37 @@ def get_chr_dp_ann(chrom: str) -> hl.Table:
         if chrom in ref.y_contigs:
             chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())
 
-        return chr_mt.select_cols(
-            **{
-                f"{chrom}_mean_dp": hl.agg.sum(
-                    hl.cond(
-                        chr_mt.LGT.is_hom_ref(),
-                        chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
-                        chr_mt.DP,
-                    )
+        if use_only_variants:
+            if included_calling_intervals is not None:
+                chr_mt = chr_mt.filter_rows(
+                    hl.is_defined(included_calling_intervals[chr_mt.row_key])
+                )
+            if excluded_calling_intervals is not None:
+                chr_mt = chr_mt.filter_rows(
+                    hl.is_missing(excluded_calling_intervals[chr_mt.row_key])
                 )
-                / contig_size
-            }
-        ).cols()
+            return chr_mt.select_cols(
+                **{
+                    f"{chrom}_mean_dp": hl.agg.filter(
+                        chr_mt.LGT.is_non_ref(),
+                        hl.agg.sum(chr_mt.DP),
+                    )
+                    / hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count())
+                }
+            ).cols()
+        else:
+            return chr_mt.select_cols(
+                **{
+                    f"{chrom}_mean_dp": hl.agg.sum(
+                        hl.if_else(
+                            chr_mt.LGT.is_hom_ref(),
+                            chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
+                            chr_mt.DP,
+                        )
+                    )
+                    / contig_size
+                }
+            ).cols()
 
     normalization_chrom_dp = get_chr_dp_ann(normalization_contig)
     chrX_dp = get_chr_dp_ann(chr_x)