diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 31bc0992e..362b7a53d 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -11,7 +11,7 @@ from gnomad.utils.reference_genome import get_reference_genome from gnomad.utils.sparse_mt import impute_sex_ploidy -from typing import List, Optional +from typing import List, Optional, Union logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -209,7 +209,7 @@ def get_qc_mt( def annotate_sex( - mt: hl.MatrixTable, + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], is_sparse: bool = True, excluded_intervals: Optional[hl.Table] = None, included_intervals: Optional[hl.Table] = None, @@ -225,7 +225,7 @@ def annotate_sex( Return Table with the following fields: - s (str): Sample - - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20. + - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. @@ -238,7 +238,7 @@ def annotate_sex( - Y_karyotype (str): Sample's chromosome Y karyotype. - sex_karyotype (str): Sample's sex karyotype. - :param mt: Input MatrixTable + :param mtds: Input MatrixTable or VariantDataset :param bool is_sparse: Whether input MatrixTable is in sparse data format :param excluded_intervals: Optional table of intervals to exclude from the computation. :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. @@ -252,14 +252,38 @@ def annotate_sex( :return: Table of samples and their imputed sex karyotypes. """ logger.info("Imputing sex chromosome ploidies...") - if is_sparse: - ploidy_ht = impute_sex_ploidy( - mt, excluded_intervals, included_intervals, normalization_contig + + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + if excluded_intervals is not None: + raise NotImplementedError( + "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!" + ) + ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( + mtds, + calling_intervals=included_intervals, + normalization_contig=normalization_contig, ) - else: - raise NotImplementedError( - "Imputing sex ploidy does not exist yet for dense data." + ploidy_ht = ploidy_ht.rename( + { + "x_ploidy": "chrX_ploidy", + "y_ploidy": "chrY_ploidy", + "x_mean_dp": "chrX_mean_dp", + "y_mean_dp": "chrY_mean_dp", + "autosomal_mean_dp": f"{normalization_contig}_mean_dp", + } ) + mt = mtds.variant_data + else: + mt = mtds + if is_sparse: + ploidy_ht = impute_sex_ploidy( + mt, excluded_intervals, included_intervals, normalization_contig + ) + else: + raise NotImplementedError( + "Imputing sex ploidy does not exist yet for dense data." + ) x_contigs = get_reference_genome(mt.locus).x_contigs logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs) @@ -269,8 +293,15 @@ def annotate_sex( mt = mt.filter_rows( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) + + build = get_reference_genome(mt.locus).name mt = hl.filter_intervals( - mt, [hl.parse_locus_interval(contig) for contig in x_contigs] + mt, + [ + hl.parse_locus_interval(contig, reference_genome=build) + for contig in x_contigs + ], + keep=True, ) if sites_ht is not None: diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index a40a18b1e..988cf5090 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -11,7 +11,8 @@ from typing import Callable, List, Dict, Optional, Tuple, Union import hail as hl -from hailtop.aiotools import LocalAsyncFS, RouterAsyncFS, AsyncFS +from hailtop.aiotools import LocalAsyncFS, AsyncFS +from hailtop.aiotools.router_fs import RouterAsyncFS from hailtop.aiogoogle import GoogleStorageAsyncFS from hailtop.utils import bounded_gather, tqdm