diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 31bc0992e..e23d41ff5 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -121,6 +121,7 @@ def get_qc_mt( filter_segdup: bool = True, filter_exome_low_coverage_regions: bool = False, high_conf_regions: Optional[List[str]] = None, + checkpoint_path: Optional[str] = None, ) -> hl.MatrixTable: """ Create a QC-ready MT. @@ -147,6 +148,7 @@ def get_qc_mt( :param filter_segdup: Filter segmental duplication regions :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept) :param high_conf_regions: If given, the data will be filtered to only include variants in those regions + :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. :return: Filtered MT """ logger.info("Creating QC MatrixTable") @@ -179,7 +181,12 @@ def get_qc_mt( ) if ld_r2 is not None: - qc_mt = qc_mt.persist() + if checkpoint_path: + logger.info("Checkpointing the MT and LD pruning") + qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) + else: + logger.info("Persisting the MT and LD pruning") + qc_mt = qc_mt.persist() unfiltered_qc_mt = qc_mt.unfilter_entries() pruned_ht = hl.ld_prune(unfiltered_qc_mt.GT, r2=ld_r2) qc_mt = qc_mt.filter_rows(hl.is_defined(pruned_ht[qc_mt.row_key]))