-
Notifications
You must be signed in to change notification settings - Fork 31
Add gnomad_gks() and get_gks() for extracting gks information for a specified variant #539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 24 commits
20a4e7e
6d30766
80ec015
f9d2a7b
fa6bfd4
9985815
d18e1c6
c0ac756
b22197d
26c99f8
e5dbbb1
e4956f1
377bd69
36b64b0
d811c4b
a1e8fc9
8bd283f
01c5e30
0868da8
2ec4d3c
8987051
cf2095f
737eedc
4dd1b60
ddca8b4
f64795c
c97fe5a
e04691e
08682a3
03044bf
c6ec474
925b4ac
1a752c0
88f7309
3e5da3e
becf9ce
c453934
22ab092
4a085e7
2f275ce
4f6de00
6e11d1b
6ee24bc
14819af
1f58568
87fd2fd
634ab95
3499e40
4d374d2
3aa7f68
55340de
2d40b72
ac50e04
8c29fb8
ce76d83
93be0d3
0d4a67a
19f3c26
50c2b22
6b75bfa
914b313
c4d65b8
e6ad87f
95bf58c
980b1e8
f006281
bc75879
7a7ea7d
589a2fc
65de216
8fd0ef9
00418c6
1013b60
320d4e2
f9c2f0f
5826d02
fb5cbcc
3befde4
d467939
8bc2e39
1043918
fd7e183
97b2b12
7060e65
6a3ab9a
8aec570
33aa1b9
ced5cd7
5121ca5
5b62076
2d20af2
c775ffd
8b9f87e
d0cce17
64b0117
c8146ba
abb8187
62e7c63
2eb04cf
2115597
4935ff2
517f2c5
61a85c7
7e0d001
c0c17e7
cd7a81a
1fc3df9
24c069e
511b8d7
3f49f84
1eefe6a
8273a8b
10f4dfa
faa26c3
9f4f0b7
2acb730
0106f08
2521ec4
4263f07
8fe0457
dd7b65e
1911962
389622f
f02cc53
be87522
71131a0
44d37d7
1ce6f65
974c846
ccd8d5c
5d837ef
6d8dd71
057e2e7
249c3de
d5e495f
363b9d3
0cfee66
db12d9b
1807bda
9244cb5
40d055a
20a3831
2273906
c23119b
a089bbc
f7ce832
1dda889
1bb92bb
8b9121b
b25685b
6183ad9
1579002
581cd7a
12a11ae
c946a0b
be43606
d622059
edad95d
a4dc91c
af73f09
6dcc781
89de5e2
3a448cd
541ad88
28fc712
7d90d58
2bc0a22
aaef59b
0b355d1
d1ce1d4
399826e
e2142ee
e409d67
6f2d8a0
30c2ef3
53e75f1
89c980f
e838a7a
ec030eb
3a52ddc
f1d87a3
a12975d
688eaf1
fe7a612
756beb8
b2f072f
35e581e
bc668a4
1e863e1
a4032b7
4383dc7
b1d749f
9888ce5
7095e94
5e290b2
4b20c83
95dccd5
b4997ae
cab3034
2aedfe5
0db45d9
3753558
e6f042a
0ce44f1
01713a7
0a7aad4
82cc9b3
1926316
0ca8b3b
08ce557
24188af
e48c7b5
71d25bb
8855a36
97f6ab5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,12 +1,17 @@ | ||
| # noqa: D100 | ||
|
|
||
| import json | ||
| import logging | ||
| from typing import Any, Dict, List, Optional, Set, Tuple, Union | ||
|
|
||
| import ga4gh.core as ga4gh_core | ||
| import ga4gh.vrs as ga4gh_vrs | ||
| import hail as hl | ||
| from hail.utils.misc import new_temp_file # format and run isort on later when pulled | ||
matren395 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
matren395 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| from gnomad.utils.gen_stats import to_phred | ||
|
|
||
|
|
||
| logging.basicConfig( | ||
| format="%(asctime)s (%(name)s %(lineno)s): %(message)s", | ||
| datefmt="%m/%d/%Y %I:%M:%S %p", | ||
|
|
@@ -32,6 +37,61 @@ | |
| "pab_max": (0, 1, 50), | ||
| } | ||
|
|
||
| VRS_CHROM_IDS = { | ||
matren395 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "GRCh38": { | ||
| "chr1": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", | ||
| "chr2": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", | ||
| "chr3": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", | ||
| "chr4": "ga4gh:SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc", | ||
| "chr5": "ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", | ||
| "chr6": "ga4gh:SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV", | ||
| "chr7": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", | ||
| "chr8": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", | ||
| "chr9": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", | ||
| "chr10": "ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB", | ||
| "chr11": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", | ||
| "chr12": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", | ||
| "chr13": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", | ||
| "chr14": "ga4gh:SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm", | ||
| "chr15": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6", | ||
| "chr16": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", | ||
| "chr17": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", | ||
| "chr18": "ga4gh:SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV", | ||
| "chr19": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", | ||
| "chr20": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo", | ||
| "chr21": "ga4gh:SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8", | ||
| "chr22": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", | ||
| "chrX": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", | ||
| "chrY": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", | ||
| }, | ||
| "GRCh37": { | ||
| "1": "ga4gh:SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU", | ||
| "2": "ga4gh:SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO", | ||
| "3": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", | ||
| "4": "ga4gh:SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V", | ||
| "5": "ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX", | ||
| "6": "ga4gh:SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek", | ||
| "7": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", | ||
| "8": "ga4gh:SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6", | ||
| "9": "ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt", | ||
| "10": "ga4gh:SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX", | ||
| "11": "ga4gh:SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG", | ||
| "12": "ga4gh:SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH", | ||
| "13": "ga4gh:SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH", | ||
| "14": "ga4gh:SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf", | ||
| "15": "ga4gh:SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt", | ||
| "16": "ga4gh:SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb", | ||
| "17": "ga4gh:SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz", | ||
| "18": "ga4gh:SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD", | ||
| "19": "ga4gh:SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX", | ||
| "20": "ga4gh:SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf", | ||
| "21": "ga4gh:SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg", | ||
| "22": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8", | ||
| "X": "ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm", | ||
| "Y": "ga4gh:SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-" | ||
| }, | ||
| } | ||
|
|
||
|
|
||
| def pop_max_expr( | ||
| freq: hl.expr.ArrayExpression, | ||
|
|
@@ -1138,3 +1198,219 @@ def hemi_expr( | |
| # mt.GT[0] is alternate allele | ||
| gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1), | ||
| ) | ||
|
|
||
|
|
||
| def get_gks( | ||
| ht: hl.Table, | ||
| variant: str, | ||
| label_name: str, | ||
| label_version: str, | ||
| coverage_ht: hl.Table, | ||
matren395 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ancestry_groups: list = None, | ||
| ancestry_groups_dict: dict = None, | ||
| by_sex: bool = False, | ||
| vrs_only: bool = False, | ||
| ) -> dict: | ||
| """ | ||
| Filter to a specified variant and return a Python dictionary containing the GA4GH variant report schema. | ||
|
|
||
| :param ht: Hail Table to parse for desired variant. | ||
| :param variant: String of variant to search for (chromosome, position, ref, and alt, separated by '-'). Example for a variant in build GRCh38: "chr5-38258681-C-T". | ||
| :param label_name: Label name to use within the returned dictionary. Example: "gnomAD". | ||
| :param label_version: String listing the version of the HT being used. Example: "3.1.2" . | ||
| :param coverage_ht: Hail Table containing coverage statistics, with mean depth stored in "mean" annotation. | ||
| :param ancestry_groups: List of strings of shortened names of genetic ancestry groups to return results for. Example: ['afr','fin','nfe'] . | ||
| :param ancestry_groups_dict: Dict mapping shortened genetic ancestry group names to full names. Example: {'afr':'African/African American'} . | ||
| :param by_sex: Boolean to include breakdown of ancestry groups by inferred sex (XX and XY) as well. | ||
| :param vrs_only: Boolean to return only the VRS information and no general frequency information. Default is False. | ||
| :return: Dictionary containing VRS information (and frequency information split by ancestry groups and sex if desired) for the specified variant. | ||
|
|
||
| """ | ||
| from gnomad.utils.filtering import get_reference_genome | ||
matren395 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # Define variables for variant information. | ||
| build_in = get_reference_genome(ht.locus).name | ||
| chrom_dict = VRS_CHROM_IDS[build_in] | ||
| chr_in, pos_in, ref_in, alt_in = variant.split("-") | ||
|
|
||
| # Filter HT to desired variant. | ||
| ht = ht.filter( | ||
| ( | ||
| ht.locus | ||
| == hl.locus(contig=chr_in, pos=int(pos_in), reference_genome=build_in) | ||
| ) | ||
| & (ht.alleles == [ref_in, alt_in]) | ||
| ) | ||
| ht = ht.checkpoint(new_temp_file("get_gks", extension="ht")) | ||
| # Check to ensure the ht is successfully filtered to 1 variant. | ||
| if ht.count() != 1: | ||
| raise ValueError( | ||
| "Error: can only work with one variant for this code, 0 or multiple" | ||
| " returned." | ||
| ) | ||
|
|
||
| # Define VRS Attributes that will later be read into the dictionary to be returned. | ||
| vrs_id = f"{ht.info.vrs.VRS_Allele_IDs[1].collect()[0]}" | ||
| vrs_chrom_id = f"{chrom_dict[chr_in]}" | ||
| vrs_start_value = ht.info.vrs.VRS_Starts[1].collect()[0] | ||
| vrs_end_value = ht.info.vrs.VRS_Ends[1].collect()[0] | ||
| vrs_state_sequence = f"{ht.info.vrs.VRS_States[1].collect()[0]}" | ||
|
|
||
| # Defining the dictionary for VRS information. | ||
| vrs_dict = { | ||
| "_id": vrs_id, | ||
| "location": { | ||
| "_id": "to-be-defined", | ||
| "interval": { | ||
| "end": {"type": "Number", "value": vrs_end_value}, | ||
| "start": { | ||
| "type": "Number", | ||
| "value": vrs_start_value, | ||
| }, | ||
| "type": "SequenceInterval", | ||
| }, | ||
| "sequence_id": vrs_chrom_id, | ||
| "type": "SequenceLocation", | ||
| }, | ||
| "state": {"sequence": vrs_state_sequence, "type": "LiteralSequenceExpression"}, | ||
| "type": "Allele", | ||
| } | ||
|
|
||
| # Set location ID | ||
| location_dict = vrs_dict["location"] | ||
| location_dict.pop("_id") | ||
| location = ga4gh_vrs.models.SequenceLocation(**location_dict) | ||
| location_id = ga4gh_core._internal.identifiers.ga4gh_identify(location) | ||
| vrs_dict["location"]["_id"] = location_id | ||
|
|
||
| logger.info(vrs_dict) | ||
|
|
||
matren395 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # If vrs_only was passed, only return the above dict and stop. | ||
| if vrs_only: | ||
| return vrs_dict | ||
|
|
||
| # Create a list to then add the dictionaries for frequency reports for different ancestry groups to. | ||
| list_of_group_info_dicts = [] | ||
|
|
||
| # Define function to return a frequency report dictionary for a given group | ||
| def _create_group_dicts( | ||
| variant_ht: hl.Table, | ||
| group_index: int, | ||
| group_id: str, | ||
| group_label: str, | ||
| vrs_id: str, | ||
| ) -> dict: | ||
| """ | ||
| Return a dictionary for the frequency information of a given variant for a given subpopulation | ||
matren395 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| :param variant_ht: Hail Table with only one row, only containing the desired variant. | ||
| :param group_index: Index of frequency within the 'freq' annotation containing the desired group. | ||
| :param group_id: String containing variant, genetic ancestry group, and sex (if requested). Example: "chr19-41094895-C-T.afr.XX". | ||
| :param group_label: String containing the full name of genetic ancestry group requested. Example: "African/African American". | ||
| :param vrs_id: String containing the VRS ID of the variant in ht_subpop. | ||
| :return: Dictionary containing VRS information (and genetic ancestry group if desired) for specified variant. | ||
|
|
||
| """ | ||
|
|
||
| # Obtain frequency information for the specified variant | ||
| group_freq = variant_ht.freq[group_index] | ||
|
|
||
| # Dictionary to be returned containing information for a specified group | ||
| freq_record = { | ||
| "id": group_id, | ||
|
||
| "type": "PopulationAlleleFrequency", | ||
| "label": f"{group_label} Population Allele Frequency for {variant}", | ||
| "focusAllele": vrs_id, | ||
| "focusAlleleCount": group_freq["AC"].collect()[0], | ||
| "locusAlleleCount": group_freq["AN"].collect()[0], | ||
| "alleleFrequency": group_freq["AF"].collect()[0], | ||
| "population": f"{label_name}{label_version}:{group_id}", | ||
| "ancillaryResults": { | ||
| "homozygotes": group_freq["homozygote_count"].collect()[0] | ||
| }, | ||
| } | ||
|
|
||
| return freq_record | ||
|
|
||
| # Iterate through provided groups and generate dictionaries | ||
| if ancestry_groups: | ||
| for group in ancestry_groups: | ||
| key = f"{group}-adj" | ||
| index_value = ht.freq_index_dict.get(key) | ||
| group_result = _create_group_dicts( | ||
| variant_ht=ht, | ||
| group_index=index_value, | ||
| group_id=group, | ||
matren395 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| group_label=ancestry_groups_dict[group], | ||
| vrs_id=vrs_id, | ||
| ) | ||
|
|
||
| # If specified, stratify group information by sex. | ||
| if by_sex: | ||
| sex_list = [] | ||
| for sex in ["XX", "XY"]: | ||
| sex_key = f"{group}-{sex}-adj" | ||
| sex_index_value = ht.freq_index_dict.get(sex_key) | ||
| sex_label = f"{group}.{sex}" | ||
matren395 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| sex_result = _create_group_dicts( | ||
| variant_ht=ht, | ||
| group_index=sex_index_value, | ||
| group_id=sex_label, | ||
| group_label=ancestry_groups_dict[group], | ||
| vrs_id=vrs_id, | ||
| ) | ||
| sex_list.append(sex_result) | ||
|
|
||
| group_result["subpopulationFrequency"] = sex_list | ||
|
|
||
| list_of_group_info_dicts.append(group_result) | ||
|
|
||
| # Overall frequency, via label 'adj' which is currently stored at position #1 (index 0) | ||
| overall_freq = ht.freq[0] | ||
|
|
||
| # Read coverage statistics | ||
| coverage_ht = coverage_ht.filter( | ||
| coverage_ht.locus | ||
| == hl.locus(contig=chr_in, pos=int(pos_in), reference_genome=build_in) | ||
| ) | ||
| mean_coverage = coverage_ht.mean.collect()[0] | ||
|
|
||
| # Final dictionary to be returned | ||
| final_freq_dict = { | ||
| "id": f"{label_name}{label_version}:{variant}", | ||
| "type": "PopulationAlleleFrequency", | ||
| "label": f"Overall Population Allele Frequency for {variant}", | ||
| "derivedFrom": { | ||
| "id": f"{label_name}{label_version}", | ||
| "type": "DataSet", | ||
| "label": f"{label_name} v{label_version}", | ||
| "version": f"{label_version}", | ||
| }, | ||
| "focusAllele": vrs_dict, | ||
| "focusAlleleCount": overall_freq["AC"].collect()[0], | ||
| "locusAlleleCount": overall_freq["AN"].collect()[0], | ||
| "alleleFrequency": overall_freq["AF"].collect()[0], | ||
| "population": f"{label_name}{label_version}:global", | ||
| "ancillaryResults": { | ||
| "popMaxFAF95": { | ||
| "frequency": ht.popmax.faf95.collect()[0], | ||
| "confidenceInterval": 0.95, | ||
| "popFreqId": f"{variant}.{ht.popmax.pop.collect()[0].upper()}", | ||
| }, | ||
| "homozygotes": overall_freq["homozygote_count"].collect()[0], | ||
| "meanDepth": mean_coverage, | ||
| }, | ||
| } | ||
|
|
||
| # If ancestry_groups were passed, add the ancestry group dictionary to the final frequency dictionary to be returned. | ||
| if ancestry_groups: | ||
| final_freq_dict["subpopulationFrequency"] = list_of_group_info_dicts | ||
|
|
||
| # Validate that our constructed dictionary would convert to a JSON string. | ||
matren395 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| try: | ||
| validated_json = json.dumps(final_freq_dict) | ||
| except: | ||
| raise SyntaxError("The dictionary did not convert to a valid JSON") | ||
|
|
||
| # Returns the constructed dictionary. | ||
| return final_freq_dict | ||
Uh oh!
There was an error while loading. Please reload this page.