Skip to content

Commit 79a2e02

Browse files
committed
feat(pipelines): add v4 lof curations pipeline
1 parent 95fc13a commit 79a2e02

File tree

4 files changed

+63
-7
lines changed

4 files changed

+63
-7
lines changed

data-pipeline/src/data_pipeline/datasets/gnomad_v2/gnomad_v2_lof_curation.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@
3030
"not_lof": "Not LoF",
3131
}
3232

33+
VERDICT_MAPPINGS_CLEAN = VERDICT_MAPPING.values()
3334

34-
def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
35+
36+
def import_gnomad_lof_curation_results(curation_result_paths, genes_path, reference_genome="GRCh37"):
3537
all_flags = set()
3638

3739
with hl.hadoop_open("/tmp/import_temp.tsv", "w") as temp_output_file:
@@ -52,8 +54,13 @@ def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
5254

5355
for row in reader:
5456
[chrom, pos, ref, alt] = row["Variant ID"].split("-")
57+
chrom = f"chr{chrom}" if reference_genome == "GRCh38" else chrom
5558

56-
variant_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags if row[f"Flag {f}"] == "TRUE"]
59+
variant_flags = [
60+
FLAG_MAPPING.get(f, f)
61+
for f in raw_dataset_flags
62+
if row.get(f"Flag {f}") == "TRUE" or row.get(f"FLAG {f}") == "1"
63+
]
5764

5865
genes = [gene_id for (gene_id, gene_symbol) in (gene.split(":") for gene in row["Gene"].split(";"))]
5966

@@ -62,7 +69,8 @@ def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
6269
if verdict == "inufficient_evidence":
6370
verdict = "insufficient_evidence"
6471

65-
verdict = VERDICT_MAPPING[verdict]
72+
if verdict not in VERDICT_MAPPINGS_CLEAN:
73+
verdict = VERDICT_MAPPING[verdict]
6674

6775
output_row = [
6876
chrom,
@@ -81,7 +89,7 @@ def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
8189
ds = hl.import_table("/tmp/import_temp.tsv")
8290

8391
ds = ds.transmute(
84-
locus=hl.locus(ds.chrom, hl.int(ds.position)),
92+
locus=hl.locus(ds.chrom, hl.int(ds.position), reference_genome),
8593
alleles=[ds.ref, ds.alt],
8694
)
8795

data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
from data_pipeline.pipelines.gnomad_v3_short_tandem_repeats import pipeline as gnomad_v3_short_tandem_repeats_pipeline
3939
from data_pipeline.pipelines.gnomad_v4_variants import pipeline as gnomad_v4_variants_pipeline
4040
from data_pipeline.pipelines.gnomad_v4_coverage import pipeline as gnomad_v4_coverage_pipeline
41-
4241
from data_pipeline.pipelines.gnomad_v4_cnvs import pipeline as gnomad_v4_cnvs_pipeline
42+
from data_pipeline.pipelines.gnomad_v4_lof_curation_results import pipeline as gnomad_v4_lof_curation_results_pipeline
4343

4444

4545
logger = logging.getLogger("gnomad_data_pipeline")
@@ -145,6 +145,18 @@ def add_liftover_document_id(ds):
145145
# ),
146146
# "args": {"index": "gnomad_v4_genome_coverage", "id_field": "xpos", "num_shards": 2, "block_size": 10_000},
147147
# },
148+
"gnomad_v4_lof_curation_results": {
149+
"get_table": lambda: add_variant_document_id(
150+
hl.read_table(gnomad_v4_lof_curation_results_pipeline.get_output("lof_curation_results").get_output_path())
151+
),
152+
"args": {
153+
"index": "gnomad_v4_lof_curation_results",
154+
"index_fields": ["document_id", "variant_id", "locus", "lof_curations.gene_id"],
155+
"id_field": "document_id",
156+
"num_shards": 1,
157+
"block_size": 1_000,
158+
},
159+
},
148160
##############################################################################################################
149161
# gnomAD v4 CNVs
150162
##############################################################################################################

data-pipeline/src/data_pipeline/pipelines/gnomad_v2_lof_curation_results.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from data_pipeline.pipeline import Pipeline, run_pipeline
22

3-
from data_pipeline.datasets.gnomad_v2.gnomad_v2_lof_curation import import_gnomad_v2_lof_curation_results
3+
from data_pipeline.datasets.gnomad_v2.gnomad_v2_lof_curation import import_gnomad_lof_curation_results
44

55
from data_pipeline.pipelines.genes import pipeline as genes_pipeline
66

@@ -9,7 +9,7 @@
99

1010
pipeline.add_task(
1111
"prepare_gnomad_v2_lof_curation_results",
12-
import_gnomad_v2_lof_curation_results,
12+
import_gnomad_lof_curation_results,
1313
"/gnomad_v2/gnomad_v2_lof_curation_results.ht",
1414
{"genes_path": genes_pipeline.get_output("genes_grch37")},
1515
{
data-pipeline/src/data_pipeline/pipelines/gnomad_v4_lof_curation_results.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from data_pipeline.pipeline import Pipeline, run_pipeline
2+
3+
from data_pipeline.datasets.gnomad_v2.gnomad_v2_lof_curation import import_gnomad_lof_curation_results
4+
5+
from data_pipeline.pipelines.genes import pipeline as genes_pipeline
6+
7+
8+
pipeline = Pipeline()
9+
10+
pipeline.add_task(
11+
"prepare_gnomad_v4_lof_curation_results",
12+
import_gnomad_lof_curation_results,
13+
"/gnomad_v4/gnomad_v4_lof_curation_results.ht",
14+
{"genes_path": genes_pipeline.get_output("genes_grch38")},
15+
{
16+
# If a result for a variant/gene pair is present in more than one file,
17+
# the result in the first file in this list takes precedence.
18+
"curation_result_paths": [
19+
"gs://gnomad-v4-data-pipeline/inputs/lof_curation/gnomAD_v4/gnomAD_incomplete_penetrance_final_results.csv",
20+
],
21+
"reference_genome": "GRCh38",
22+
},
23+
)
24+
25+
###############################################
26+
# Outputs
27+
###############################################
28+
29+
pipeline.set_outputs({"lof_curation_results": "prepare_gnomad_v4_lof_curation_results"})
30+
31+
###############################################
32+
# Run
33+
###############################################
34+
35+
if __name__ == "__main__":
36+
run_pipeline(pipeline)

0 commit comments

Comments
 (0)