diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 903ab3c10..c5c12cb91 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -36,56 +36,56 @@ VRS_CHROM_IDS = { "GRCh38": { - "chr1": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "chr2": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", - "chr3": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", - "chr4": "ga4gh:SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc", - "chr5": "ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", - "chr6": "ga4gh:SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV", - "chr7": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "chr8": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", - "chr9": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", - "chr10": "ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB", - "chr11": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", - "chr12": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", - "chr13": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", - "chr14": "ga4gh:SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm", - "chr15": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6", - "chr16": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", - "chr17": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", - "chr18": "ga4gh:SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV", - "chr19": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", - "chr20": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo", - "chr21": "ga4gh:SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8", - "chr22": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "chrX": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "chrY": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "chr1": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "chr2": "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", + "chr3": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", + "chr4": "SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc", + "chr5": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + "chr6": "SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV", + "chr7": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "chr8": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", + "chr9": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + "chr10": "SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB", + "chr11": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + "chr12": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", + "chr13": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", + "chr14": "SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm", + "chr15": "SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6", + "chr16": "SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", + "chr17": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", + "chr18": "SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV", + "chr19": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + "chr20": "SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo", + "chr21": "SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8", + "chr22": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "chrX": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "chrY": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", }, "GRCh37": { - "1": "ga4gh:SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU", - "2": "ga4gh:SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO", - "3": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", - "4": "ga4gh:SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V", - "5": "ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX", - "6": "ga4gh:SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek", - "7": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", - "8": "ga4gh:SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6", - "9": "ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt", - "10": "ga4gh:SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX", - "11": "ga4gh:SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG", - "12": "ga4gh:SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH", - "13": "ga4gh:SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH", - "14": "ga4gh:SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf", - "15": "ga4gh:SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt", - "16": "ga4gh:SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb", - "17": "ga4gh:SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz", - "18": "ga4gh:SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD", - "19": "ga4gh:SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX", - "20": "ga4gh:SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf", - "21": "ga4gh:SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg", - "22": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8", - "X": "ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm", - "Y": "ga4gh:SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-", + "1": "SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU", + "2": "SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO", + "3": "SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", + "4": "SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V", + "5": "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX", + "6": "SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek", + "7": "SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", + "8": "SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6", + "9": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt", + "10": "SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX", + "11": "SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG", + "12": "SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH", + "13": "SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH", + "14": "SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf", + "15": "SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt", + "16": "SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb", + "17": "SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz", + "18": "SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD", + "19": "SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX", + "20": "SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf", + "21": "SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg", + "22": "SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8", + "X": "SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm", + "Y": "SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-", }, } @@ -2672,9 +2672,13 @@ def add_gks_vrs( "state": {"type": "LiteralSequenceExpression", "sequence": vrs_state_sequence}, } - location_id = ga4gh_core._internal.identifiers.ga4gh_identify( - ga4gh_vrs.models.SequenceLocation(**vrs_dict_out["location"]) + seq_ref = ga4gh_vrs.models.SequenceReference(refgetAccession=vrs_chrom_id) + seq_loc = ga4gh_vrs.models.SequenceLocation( + sequenceReference=seq_ref, + start=vrs_start_value, + end=vrs_end_value, ) + location_id = ga4gh_core.ga4gh_identify(seq_loc) vrs_dict_out["location"]["_id"] = location_id @@ -2860,9 +2864,7 @@ def _create_group_dicts( ancillaryResults["grpMaxFAF95"] = { "frequency": input_struct.grpMaxFAF95.grpmax_gen_anc, "confidenceInterval": 0.95, - "groupId": ( - f"{gnomad_id}.{input_struct.grpMaxFAF95.grpmax_gen_anc.upper()}" - ), + "groupId": f"{gnomad_id}.{input_struct.grpMaxFAF95.grpmax_gen_anc.upper()}", } # Add joint group max FAF if it exists. diff --git a/requirements.txt b/requirements.txt index 2148ddafd..3915ad76f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ annoy -ga4gh.vrs[extras]==0.8.4 +ga4gh.vrs[extras]==2.0.1 hail hdbscan ipywidgets diff --git a/tests/utils/test_annotations.py b/tests/utils/test_annotations.py index f6b060ecc..5925fd520 100644 --- a/tests/utils/test_annotations.py +++ b/tests/utils/test_annotations.py @@ -2,10 +2,14 @@ from typing import Dict, List +import ga4gh.core as ga4gh_core +import ga4gh.vrs as ga4gh_vrs import hail as hl import pytest from gnomad.utils.annotations import ( + VRS_CHROM_IDS, + add_gks_vrs, annotate_downsamplings, fill_missing_key_combinations, get_copy_state_by_sex, @@ -1088,6 +1092,110 @@ def test_merge_histograms_sum_with_negatives_error(self, sample_ht): ht.select(result_hist=result_hist).collect() +class TestVRSFunctions: + """Test the VRS-related functions.""" + + def test_vrs_identifier_generation(self): + """Test that GA4GH identifiers are generated correctly.""" + # Test that we can import and access VRS modules + assert hasattr(ga4gh_core, "__version__") + assert hasattr(ga4gh_vrs, "models") + assert hasattr(ga4gh_vrs.models, "SequenceLocation") + + # Test that VRS 2.0.1+ API is available + assert hasattr( + ga4gh_core, "ga4gh_identify" + ), "VRS 2.0.1+ ga4gh_identify function not found" + + # Test that we can create a VRS object using the 2.0.1+ API + seq_loc = ga4gh_vrs.models.SequenceLocation( + sequenceReference="ga4gh:SQ.test", + start=1, + end=2, + ) + assert seq_loc is not None + + def test_vrs_error_handling(self): + """Test that VRS functions handle errors gracefully.""" + # Test with invalid locus (should raise appropriate error) + with pytest.raises((AttributeError, TypeError)): + # This should fail because we're not providing proper Hail locus objects + add_gks_vrs("invalid_locus", "invalid_vrs") + + def test_add_gks_vrs_actual_api_call(self): + """Test that add_gks_vrs actually calls the VRS 2.0.1 API with real data.""" + # Create a real Hail locus and VRS struct that would come from actual data + locus = hl.locus("chr1", 100, reference_genome="GRCh38") + + # Create a VRS struct that mimics what would be in actual gnomAD data + vrs_struct = hl.struct( + VRS_Allele_IDs=["test_ref_id", "ga4gh:VA.test_var_id"], + VRS_Starts=[99, 99], # 0-based coordinates + VRS_Ends=[100, 100], + VRS_States=["A", "T"], # ref, alt + ) + + # Evaluate to get actual Python objects + locus_py = hl.eval(locus) + vrs_py = hl.eval(vrs_struct) + + result = add_gks_vrs(locus_py, vrs_py) + + # Verify the result has the expected VRS structure + assert isinstance(result, dict) + assert result["type"] == "Allele" + assert "location" in result + assert result["location"]["type"] == "SequenceLocation" + assert "_id" in result["location"] # This comes from ga4gh_identify() call + assert "state" in result + assert result["state"]["type"] == "LiteralSequenceExpression" + assert result["state"]["sequence"] == "T" + + # Verify the chromosome ID matches our VRS_CHROM_IDS mapping + expected_chr1_id = VRS_CHROM_IDS["GRCh38"]["chr1"] + assert result["location"]["sequence_id"] == expected_chr1_id + assert expected_chr1_id == "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + + # Verify the location ID was generated by the VRS API + location_id = result["location"]["_id"] + assert isinstance(location_id, str) + assert location_id.startswith("ga4gh:") # VRS identifiers start with ga4gh: + + def test_add_gks_vrs_grch37_chromosome_mapping(self): + """Test that VRS chromosome mapping works correctly for GRCh37.""" + # Test GRCh37 (note different chromosome naming: "1" vs "chr1") + locus_grch37 = hl.locus("1", 100, reference_genome="GRCh37") + vrs_struct = hl.struct( + VRS_Allele_IDs=["test_ref_id", "ga4gh:VA.test_var_id"], + VRS_Starts=[99, 99], + VRS_Ends=[100, 100], + VRS_States=["A", "T"], + ) + + locus_py = hl.eval(locus_grch37) + vrs_py = hl.eval(vrs_struct) + + result = add_gks_vrs(locus_py, vrs_py) + + # Verify GRCh37 chromosome 1 mapping + expected_chr1_grch37_id = VRS_CHROM_IDS["GRCh37"]["1"] + assert result["location"]["sequence_id"] == expected_chr1_grch37_id + assert expected_chr1_grch37_id == "SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU" + + # Verify the API was called and generated a proper identifier + location_id = result["location"]["_id"] + assert isinstance(location_id, str) + assert location_id.startswith("ga4gh:") + + def test_vrs_version_compatibility(self): + """Test that VRS 2.0.1+ is properly installed.""" + # Test that VRS 2.0.1+ API is available + assert hasattr(ga4gh_core, "ga4gh_identify"), "VRS 2.0.1+ is required" + + # The function should work with VRS 2.0.1+ + assert callable(add_gks_vrs) + + class TestAnnotateDownsamplings: """Test the annotate_downsamplings function."""