From 1ad3666e943c9e07d4008a376bb20299f53af63f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 12 Jul 2024 15:42:27 -0400 Subject: [PATCH 1/8] Add create_vds function that only supports creating from gvcfs --- gnomad/utils/file_utils.py | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 8ddf1425e..2dce58d52 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -222,3 +222,52 @@ def repartition_for_join( " partitions than the original HT!" ) return ht._calculate_new_partitions(ht.n_partitions() * new_partition_percent) + + +def create_vds( + gvcfs: str, + output_path: str, + temp_path: Optional[str] = None, + save_path: Optional[str] = None, + use_genome_default_intervals: bool = False, + use_exome_default_intervals: bool = False, + intervals: Optional[List[str]] = None, + gvcf_batch_size: Optional[int] = None, +) -> hl.vds.VariantDataset: + """ + Combine gVCFs into a single VDS. + + :param gvcfs: Path to file containing gVCF paths with no header. + :param str output_path: Path to write output VDS. + :param str temp_path: Path to write temporary files. + :param str save_path: Path to write combiner to on failure. Can be used to restart + combiner from a failed state. If not specified, defaults to temp_path + + combiner_plan.json. + :param bool use_genome_default_intervals: Use the default genome intervals. + :param bool use_exome_default_intervals: Use the default exome intervals. + :param List[str] intervals: List of intervals to use. + :param gvcf_batch_size: Number of GVCFs to combine into a Variant Dataset at once. + :return: Combined VDS. + """ + if not save_path and temp_path: + save_path = temp_path + "combiner_plan.json" + + gvcfs = read_list_data(gvcfs) + + if not len(gvcfs) > 0: + raise DataException("No gVCFs provided in file") + + logger.info("Combining %s gVCFs into a single VDS", len(gvcfs)) + combiner = hl.vds.new_combiner( + output_path=output_path, + temp_path=temp_path, + save_path=save_path, + gvcf_paths=gvcfs, + use_genome_default_intervals=use_genome_default_intervals, + use_exome_default_intervals=use_exome_default_intervals, + intervals=intervals, + gvcf_batch_size=gvcf_batch_size, + ) + combiner.run() + vds = hl.vds.read_vds(output_path) + return vds From 4b819707a25c356aeeff66bc475cd8f0ec37c945 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 10:48:49 -0400 Subject: [PATCH 2/8] Drop types in doc strings and update all gVCF refs to GVCF --- gnomad/utils/file_utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 2dce58d52..91d2aa668 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -227,7 +227,7 @@ def repartition_for_join( def create_vds( gvcfs: str, output_path: str, - temp_path: Optional[str] = None, + temp_path: str, save_path: Optional[str] = None, use_genome_default_intervals: bool = False, use_exome_default_intervals: bool = False, @@ -235,17 +235,18 @@ def create_vds( gvcf_batch_size: Optional[int] = None, ) -> hl.vds.VariantDataset: """ - Combine gVCFs into a single VDS. + Combine GVCFs into a single VDS. :param gvcfs: Path to file containing gVCF paths with no header. - :param str output_path: Path to write output VDS. - :param str temp_path: Path to write temporary files. - :param str save_path: Path to write combiner to on failure. Can be used to restart + :param output_path: Path to write output VDS. + :param temp_path: Path to write temporary files. A bucket with a life-cycle + policy is recommended. + :param save_path: Path to write combiner to on failure. Can be used to restart combiner from a failed state. If not specified, defaults to temp_path + combiner_plan.json. - :param bool use_genome_default_intervals: Use the default genome intervals. - :param bool use_exome_default_intervals: Use the default exome intervals. - :param List[str] intervals: List of intervals to use. + :param use_genome_default_intervals: Use the default genome intervals. + :param use_exome_default_intervals: Use the default exome intervals. + :param intervals: List of intervals to use. :param gvcf_batch_size: Number of GVCFs to combine into a Variant Dataset at once. :return: Combined VDS. """ @@ -255,9 +256,9 @@ def create_vds( gvcfs = read_list_data(gvcfs) if not len(gvcfs) > 0: - raise DataException("No gVCFs provided in file") + raise DataException("No GVCFs provided in file") - logger.info("Combining %s gVCFs into a single VDS", len(gvcfs)) + logger.info("Combining %s GVCFs into a single VDS", len(gvcfs)) combiner = hl.vds.new_combiner( output_path=output_path, temp_path=temp_path, From ea73d3d6caa5f95a09fcf3f94ecedb2805a349c9 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 10:51:21 -0400 Subject: [PATCH 3/8] gVCF -> GVCF --- gnomad/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 91d2aa668..33d29de2a 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -237,7 +237,7 @@ def create_vds( """ Combine GVCFs into a single VDS. - :param gvcfs: Path to file containing gVCF paths with no header. + :param gvcfs: Path to file containing GVCF paths with no header. :param output_path: Path to write output VDS. :param temp_path: Path to write temporary files. A bucket with a life-cycle policy is recommended. From 7a616adebcfc6d44b5429ad3001d7ce9b4c8e7bd Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 11:04:48 -0400 Subject: [PATCH 4/8] Accept file of intervals and convert to list --- gnomad/utils/file_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 33d29de2a..be1414534 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -246,7 +246,7 @@ def create_vds( combiner_plan.json. :param use_genome_default_intervals: Use the default genome intervals. :param use_exome_default_intervals: Use the default exome intervals. - :param intervals: List of intervals to use. + :param intervals: Path to text file with intervals to use for VDS creation. :param gvcf_batch_size: Number of GVCFs to combine into a Variant Dataset at once. :return: Combined VDS. """ @@ -254,6 +254,7 @@ def create_vds( save_path = temp_path + "combiner_plan.json" gvcfs = read_list_data(gvcfs) + intervals = read_list_data(intervals) if intervals else None if not len(gvcfs) > 0: raise DataException("No GVCFs provided in file") From 4f5257a9d1b7d9a771c43fbc252fcb2ef46b9fd5 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 14:40:43 -0400 Subject: [PATCH 5/8] Check passed interval file --- gnomad/utils/file_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index be1414534..2ce17f71e 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -259,6 +259,9 @@ def create_vds( if not len(gvcfs) > 0: raise DataException("No GVCFs provided in file") + if intervals and not len(intervals) > 0: + raise DataException("No intervals provided in passed intervals file") + logger.info("Combining %s GVCFs into a single VDS", len(gvcfs)) combiner = hl.vds.new_combiner( output_path=output_path, From f3b5b9ac8edcb3da5c71287a14259ccbf778bab3 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 16:00:28 -0400 Subject: [PATCH 6/8] Update gnomad/utils/file_utils.py Co-authored-by: klaricch --- gnomad/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 2ce17f71e..3f8760dc6 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -239,7 +239,7 @@ def create_vds( :param gvcfs: Path to file containing GVCF paths with no header. :param output_path: Path to write output VDS. - :param temp_path: Path to write temporary files. A bucket with a life-cycle + :param temp_path: Directory path to write temporary files. A bucket with a life-cycle policy is recommended. :param save_path: Path to write combiner to on failure. Can be used to restart combiner from a failed state. If not specified, defaults to temp_path + From 78faa73444f9068cc6364deaf673f654244d09b4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 16:04:52 -0400 Subject: [PATCH 7/8] Import intervals from path and add ref --- gnomad/utils/file_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 3f8760dc6..033bb0c70 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -231,8 +231,9 @@ def create_vds( save_path: Optional[str] = None, use_genome_default_intervals: bool = False, use_exome_default_intervals: bool = False, - intervals: Optional[List[str]] = None, + intervals: Optional[str] = None, gvcf_batch_size: Optional[int] = None, + reference_genome: str = "GRCh38", ) -> hl.vds.VariantDataset: """ Combine GVCFs into a single VDS. @@ -248,13 +249,20 @@ def create_vds( :param use_exome_default_intervals: Use the default exome intervals. :param intervals: Path to text file with intervals to use for VDS creation. :param gvcf_batch_size: Number of GVCFs to combine into a Variant Dataset at once. + :param reference_genome: Reference genome to use. Default is GRCh38. :return: Combined VDS. """ if not save_path and temp_path: save_path = temp_path + "combiner_plan.json" gvcfs = read_list_data(gvcfs) - intervals = read_list_data(intervals) if intervals else None + intervals = ( + hl.import_locus_intervals( + intervals, reference_genome=reference_genome + ).collect() + if intervals + else None + ) if not len(gvcfs) > 0: raise DataException("No GVCFs provided in file") From 7c975b244cfc43747fd8be51c8248717709f71a9 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Jul 2024 16:06:29 -0400 Subject: [PATCH 8/8] Collect intervals for combiner --- gnomad/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index 033bb0c70..df0965ef1 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -259,7 +259,7 @@ def create_vds( intervals = ( hl.import_locus_intervals( intervals, reference_genome=reference_genome - ).collect() + ).interval.collect() if intervals else None )