From 14819afb4ed40abe22b7bdf50db2bde3f6b5bb52 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 7 Jun 2023 14:53:44 -0400 Subject: [PATCH 01/24] Merge N freq arrays by sum only --- gnomad/utils/annotations.py | 136 ++++++++++++++++++++++++++++++++++-- 1 file changed, 132 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index ac6881030..090fe6a07 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -125,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai] + lambda x: -x[1].AF[ai], # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -219,8 +219,10 @@ def faf_expr( ~locus.in_autosome_or_par(), hl.struct( **{ - f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency( - freq[i].AC, freq[i].AN, threshold + f"faf{str(threshold)[2:]}": ( + hl.experimental.filtering_allele_frequency( + freq[i].AC, freq[i].AN, threshold + ) ) for threshold in faf_thresholds } @@ -1106,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} + {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} ) if prob_regions is not None: @@ -1188,3 +1190,129 @@ def hemi_expr( # mt.GT[0] is alternate allele gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1), ) + + +def make_freq_index_dict_from_meta( + freq_meta: List[Dict[str, str]], + label_delimiter: str = "_", + sort_order: Optional[List[str]] = SORT_ORDER, +) -> Dict[str, int]: + """ + Create a dictionary for accessing frequency array. + + :param freq_meta: List of dictionaries containing frequency metadata. + :param label_delimiter: Delimiter to use when joining frequency metadata labels. + :param sort_order: List of frequency metadata labels to use when sorting the dictionary. + :return: Dictionary of frequency metadata. + """ + index_dict = {} + for i, f in enumerate(hl.eval(freq_meta)): + if sort_order and len(set(f.keys()) - set(sort_order)) < 1: + index_dict[ + label_delimiter.join( + [ + f[g] + for g in sorted( + f.keys(), + key=(lambda x: sort_order.index(x)) if sort_order else None, + ) + ] + ) + ] = i + + return index_dict + + +def merge_freq_arrays( + farrays: List[hl.expr.ArrayExpression], + fmeta: List[List[Dict[str, str]]], + operation: str = "sum", + sort_order: Optional[List[str]] = SORT_ORDER, +) -> hl.expr.ArrayExpression: + """ + Merge frequency arrays from multiple datasets. + + Farrays and fmeta do not need to be the same or in the same order. They will be + merged based on the join type and operation. Missing values are dependent on the + join type and operation. + :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. + :param fmeta: List of frequency metadata for arrays being merged. + :param fill_missing: If False, return an array with length equal to the shortest length of the arrays. If True, return an array equal to the longest length of the arrays, by extending the shorter arrays with missing values. + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". + :return: Merged frequency array. + """ + if len(farrays) < 2: + raise ValueError("Must provide at least two frequency arrays to merge") + if operation not in ["sum", "diff"]: + raise ValueError("Operation must be either 'sum' or 'diff'") + if len(farrays) != len(fmeta): + raise ValueError("Length of farrays and fmeta must be equal") + + # Create dictionary of frequency metadata, use first entry in fmeta as primary to + # anchor ordering, add other indices as needed, basically build freq_meta_dict + # before the actual freq array is created + fmeta_dict = make_freq_index_dict_from_meta( + freq_meta=fmeta[0], sort_order=sort_order + ) + # Create list of all groups in fmeta and if they exist in fmeta_dict, + # use the end of the list as the index in the array and add the group to freq dict + for meta in fmeta[1:]: + meta_dict = make_freq_index_dict_from_meta(meta, sort_order=sort_order) + for group in meta_dict.keys(): + if group not in fmeta_dict.keys(): + fmeta_dict[group] = len( + fmeta_dict.keys() + ) # TODO: See if can update to hail expr all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but not sorted, look into sorted? + + # Create freq array where entry is group then use map to transform to freq struct + merged_farray = hl.map( + lambda x: x[0], hl.sorted(fmeta_dict, key=lambda item: item[1]) + ) + # Make a single merged freq array by finding farray index in corresponding + # fmeta_dict for each group in all_groups set + zfreq = hl.zip(farrays, fmeta_dict) + merged_farray = merged_farray.map( + lambda x: hl.struct( + AC=hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AC, 0), zfreq + ) + ), + AN=hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), zfreq + ) + ), + AF=hl.if_else( + hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), + zfreq, + ) + ) + > 0, + hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AC, 0), + zfreq, + ) + ) + / hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), + zfreq, + ) + ), + 0, + ), + homozygote_count=hl.sum( + hl.map( + lambda f: hl.if_else( + f[1].contains(x), f[0][f[1][x]].homozygote_count, 0 + ), + zfreq, + ) + ), + ) + ) + return merged_farray From 1f58568c08cf5b5082df9738cf3c85f001b4ed21 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jun 2023 15:55:37 -0400 Subject: [PATCH 02/24] Add subtraction --- gnomad/utils/annotations.py | 194 ++++++++++++++++++++++++------------ 1 file changed, 129 insertions(+), 65 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 090fe6a07..949112324 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1237,8 +1237,8 @@ def merge_freq_arrays( join type and operation. :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. :param fmeta: List of frequency metadata for arrays being merged. - :param fill_missing: If False, return an array with length equal to the shortest length of the arrays. If True, return an array equal to the longest length of the arrays, by extending the shorter arrays with missing values. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". IF "diff" is passed, the first array in the list will have the other arrays subtracted from it. + :param sort_order: Order of frequency metadata labels to use when building label when building key for the freq dictionary. :return: Merged frequency array. """ if len(farrays) < 2: @@ -1248,71 +1248,135 @@ def merge_freq_arrays( if len(farrays) != len(fmeta): raise ValueError("Length of farrays and fmeta must be equal") - # Create dictionary of frequency metadata, use first entry in fmeta as primary to - # anchor ordering, add other indices as needed, basically build freq_meta_dict - # before the actual freq array is created - fmeta_dict = make_freq_index_dict_from_meta( - freq_meta=fmeta[0], sort_order=sort_order - ) - # Create list of all groups in fmeta and if they exist in fmeta_dict, - # use the end of the list as the index in the array and add the group to freq dict - for meta in fmeta[1:]: - meta_dict = make_freq_index_dict_from_meta(meta, sort_order=sort_order) - for group in meta_dict.keys(): - if group not in fmeta_dict.keys(): - fmeta_dict[group] = len( - fmeta_dict.keys() - ) # TODO: See if can update to hail expr all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but not sorted, look into sorted? - - # Create freq array where entry is group then use map to transform to freq struct - merged_farray = hl.map( - lambda x: x[0], hl.sorted(fmeta_dict, key=lambda item: item[1]) + # Make list of freq_dicts, one for each entry in the fmeta array + fmeta_dicts = [ + make_freq_index_dict_from_meta(meta, sort_order=sort_order) for meta in fmeta + ] + # Create dictionary from the first entry in fmeta to anchor ordering of groups + # in the merged freq array + merged_freq_dict = fmeta_dicts[0] + + # Go through each freq array dictionary, if they contain a group not present in the anchor + # dictionary, add the group and use the length of the anchor dictionary as + # it's index in the array + for fmeta_dict in fmeta_dicts[1:]: + for group in fmeta_dict.keys(): + if group not in merged_freq_dict.keys(): + merged_freq_dict[group] = len( + merged_freq_dict.keys() + ) # TODO: See if can update to hail, i.e all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but wont be sorted, look into hl.sorted? + + # Create array where the entries are the freq agg groupings which will + # then be transformed into our merged freqeunecy array of callstat structs + merged_freq_array = hl.map( + lambda x: x[0], hl.sorted(merged_freq_dict, key=lambda item: item[1]) ) + + def _AC_merge( + group, zipped_array_dicts, operation="sum" + ) -> hl.expr.Int32Expression: + """ + Merge allele counts from multiple frequency arrays. + + :param group: Group to merge across freq arrays. + :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" + and "diff". If "diff" is passed, the first array in the list will have the other arrays AC subtracted from it. + :return: Merged allele count. + """ + merged_AC = hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AC, 0), + zipped_array_dicts, + ) + ) + # If operation is diff, subtract the sum of all ACs from 2* AC of the + # first entry in the freq array + if operation == "diff": + merged = 2 * hl.sum(zfreq[0][0][zfreq[0][1][group]].AC) - merged_AC + return merged + + def _AN_merge( + group, zipped_array_dicts, operation="sum" + ) -> hl.expr.Int32Expression: + """ + Merge allele numbers from multiple frequency arrays. + + :param group: Group to merge across freq arrays. + :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" + and "diff". If "diff" is passed, the first array in the list will have the other arrays AN subtracted from it. + :return: Merged allele number. + """ + merged_AN = hl.sum( + hl.map( + lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AN, 0), + zipped_array_dicts, + ) + ) + # If operation is diff, subtract the sum of all ANs from 2*AN of the first + # entry in the freq array + if operation == "diff": + merged_AN = 2 * hl.sum(zfreq[0][0][zfreq[0][1][group]].AN) - merged_AN + + return merged_AN + + def _AF_merge( + group, zipped_array_dicts, operation="sum" + ) -> hl.expr.Float32Expression: + """ + Calculate new allele frequencies using merged AC and AN from multiple frequency arrays. + + :param group: Group to merge across freq arrays. + :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". + If "diff" is passed, the first array in the list will have the other arrays AC and AN subtracted from its own. + :return: Merged allele frequency. + """ + merged_AC = _AC_merge(group, zipped_array_dicts, operation=operation) + merged_AN = _AN_merge(group, zipped_array_dicts, operation=operation) + return hl.if_else( + merged_AN > 0, + merged_AC / merged_AN, + 0, + ) + + def _homozygote_count_merge( + group, zipped_array_dicts, operation="sum" + ) -> hl.expr.Int32Expression: + """ + Merge homozygote counts from multiple frequency arrays. + + :param group: Group to merge across freq arrays. + :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. + :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" + and "diff". If "diff" is passed, the first array in the list will have the other arrays' homozygote count subtracted from it. + :return: Merged homozygote count. + """ + sum_homozygote_count = hl.sum( + hl.map( + lambda f: hl.if_else( + f[1].contains(group), f[0][f[1][group]].homozygote_count, 0 + ), + zipped_array_dicts, + ) + ) + if operation == "diff": + sum_homozygote_count = ( + 2 * hl.sum(zfreq[0][zfreq[1][group]].homozygote_count) + - sum_homozygote_count + ) + return sum_homozygote_count + # Make a single merged freq array by finding farray index in corresponding # fmeta_dict for each group in all_groups set - zfreq = hl.zip(farrays, fmeta_dict) - merged_farray = merged_farray.map( - lambda x: hl.struct( - AC=hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AC, 0), zfreq - ) - ), - AN=hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), zfreq - ) - ), - AF=hl.if_else( - hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), - zfreq, - ) - ) - > 0, - hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AC, 0), - zfreq, - ) - ) - / hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(x), f[0][f[1][x]].AN, 0), - zfreq, - ) - ), - 0, - ), - homozygote_count=hl.sum( - hl.map( - lambda f: hl.if_else( - f[1].contains(x), f[0][f[1][x]].homozygote_count, 0 - ), - zfreq, - ) - ), + zfreq = hl.zip(farrays, fmeta_dicts) + merged_freq_array = merged_freq_array.map( + lambda group: hl.struct( + AC=_AC_merge(group, zfreq, operation=operation), + AN=_AN_merge(group, zfreq, operation=operation), + AF=_AF_merge(group, zfreq, operation=operation), + homozygote_count=_homozygote_count_merge(group, zfreq, operation=operation), ) ) - return merged_farray + return merged_freq_array From 87fd2fdf26631decd5ffb9fc6dd2dc4f56eeb8da Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jun 2023 17:45:04 -0400 Subject: [PATCH 03/24] Update diff dictionary and add comments --- gnomad/utils/annotations.py | 68 +++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 949112324..6b568e82d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -6,6 +6,7 @@ import hail as hl from gnomad.utils.gen_stats import to_phred +from gnomad.utils.vcf import SORT_ORDER logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -1228,7 +1229,7 @@ def merge_freq_arrays( fmeta: List[List[Dict[str, str]]], operation: str = "sum", sort_order: Optional[List[str]] = SORT_ORDER, -) -> hl.expr.ArrayExpression: +) -> Tuple[hl.expr.ArrayExpression, Dict[str, int]]: """ Merge frequency arrays from multiple datasets. @@ -1239,7 +1240,7 @@ def merge_freq_arrays( :param fmeta: List of frequency metadata for arrays being merged. :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". IF "diff" is passed, the first array in the list will have the other arrays subtracted from it. :param sort_order: Order of frequency metadata labels to use when building label when building key for the freq dictionary. - :return: Merged frequency array. + :return: Tuple of merged frequency array and its freq index dictionary. """ if len(farrays) < 2: raise ValueError("Must provide at least two frequency arrays to merge") @@ -1248,7 +1249,8 @@ def merge_freq_arrays( if len(farrays) != len(fmeta): raise ValueError("Length of farrays and fmeta must be equal") - # Make list of freq_dicts, one for each entry in the fmeta array + # Make list of freq_dicts, one for each entry in the fmeta array, to + # ensure label formatting matches, fmeta_dicts = [ make_freq_index_dict_from_meta(meta, sort_order=sort_order) for meta in fmeta ] @@ -1256,15 +1258,16 @@ def merge_freq_arrays( # in the merged freq array merged_freq_dict = fmeta_dicts[0] - # Go through each freq array dictionary, if they contain a group not present in the anchor - # dictionary, add the group and use the length of the anchor dictionary as - # it's index in the array - for fmeta_dict in fmeta_dicts[1:]: - for group in fmeta_dict.keys(): - if group not in merged_freq_dict.keys(): - merged_freq_dict[group] = len( - merged_freq_dict.keys() - ) # TODO: See if can update to hail, i.e all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but wont be sorted, look into hl.sorted? + # If summing multiple arrays, add groups present in each freq array + # dictionary to the anchor dictionary as keys and use the present length + # of the anchor dictionary as the index in the freq array. + if operation == "sum": + for fmeta_dict in fmeta_dicts[1:]: + for group in fmeta_dict.keys(): + if group not in merged_freq_dict.keys(): + merged_freq_dict[group] = len( + merged_freq_dict.keys() + ) # TODO: See if can update to hail, i.e all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but wont be sorted, look into hl.sorted? # Create array where the entries are the freq agg groupings which will # then be transformed into our merged freqeunecy array of callstat structs @@ -1284,17 +1287,23 @@ def _AC_merge( and "diff". If "diff" is passed, the first array in the list will have the other arrays AC subtracted from it. :return: Merged allele count. """ + # Sum ACs from each freq array for the given group merged_AC = hl.sum( hl.map( lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AC, 0), zipped_array_dicts, ) ) - # If operation is diff, subtract the sum of all ACs from 2* AC of the - # first entry in the freq array + # If operation is diff, subtract the sum of all ACs for the given group + # from 2 * AC of the first entry in the freq array if operation == "diff": - merged = 2 * hl.sum(zfreq[0][0][zfreq[0][1][group]].AC) - merged_AC - return merged + freq_to_subtract_from = zipped_array_dicts[0] + merged_AC = ( + 2 * freq_to_subtract_from[0][freq_to_subtract_from[1][group]].AC + - merged_AC + ) + merged_AC = hl.if_else(merged_AC < 0, 0, merged_AC) + return merged_AC def _AN_merge( group, zipped_array_dicts, operation="sum" @@ -1308,6 +1317,7 @@ def _AN_merge( and "diff". If "diff" is passed, the first array in the list will have the other arrays AN subtracted from it. :return: Merged allele number. """ + # Sum ANs from each freq array for the given group merged_AN = hl.sum( hl.map( lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AN, 0), @@ -1317,8 +1327,12 @@ def _AN_merge( # If operation is diff, subtract the sum of all ANs from 2*AN of the first # entry in the freq array if operation == "diff": - merged_AN = 2 * hl.sum(zfreq[0][0][zfreq[0][1][group]].AN) - merged_AN - + freq_to_subtract_from = zipped_array_dicts[0] + merged_AN = ( + 2 * freq_to_subtract_from[0][freq_to_subtract_from[1][group]].AN + - merged_AN + ) + merged_AN = hl.if_else(merged_AN < 0, 0, merged_AN) return merged_AN def _AF_merge( @@ -1353,6 +1367,7 @@ def _homozygote_count_merge( and "diff". If "diff" is passed, the first array in the list will have the other arrays' homozygote count subtracted from it. :return: Merged homozygote count. """ + # Sum homozygote counts from each freq array for the given group sum_homozygote_count = hl.sum( hl.map( lambda f: hl.if_else( @@ -1361,15 +1376,24 @@ def _homozygote_count_merge( zipped_array_dicts, ) ) + # If operation is diff, subtract the sum of all homozygote counts from + # 2 * homozygote count of the first entry in the freq array if operation == "diff": + freq_to_subtract_from = zipped_array_dicts[0] sum_homozygote_count = ( - 2 * hl.sum(zfreq[0][zfreq[1][group]].homozygote_count) + 2 + * freq_to_subtract_from[0][ + freq_to_subtract_from[1][group] + ].homozygote_count - sum_homozygote_count ) + sum_homozygote_count = hl.if_else( + sum_homozygote_count < 0, 0, sum_homozygote_count + ) return sum_homozygote_count - # Make a single merged freq array by finding farray index in corresponding - # fmeta_dict for each group in all_groups set + # Make a single merged freq array by mapping over the merged freq array and + # creating a new struct with the merged AC, AN, AF, and homozygote count zfreq = hl.zip(farrays, fmeta_dicts) merged_freq_array = merged_freq_array.map( lambda group: hl.struct( @@ -1379,4 +1403,4 @@ def _homozygote_count_merge( homozygote_count=_homozygote_count_merge(group, zfreq, operation=operation), ) ) - return merged_freq_array + return merged_freq_array, merged_freq_dict From 634ab95e07bfc145385d078e7db51a81ff3d53a4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 9 Jun 2023 14:10:04 -0400 Subject: [PATCH 04/24] Switch to Julia's merge --- gnomad/utils/annotations.py | 206 +++++++++--------------------------- 1 file changed, 49 insertions(+), 157 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 6b568e82d..e5875eb12 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1227,8 +1227,6 @@ def make_freq_index_dict_from_meta( def merge_freq_arrays( farrays: List[hl.expr.ArrayExpression], fmeta: List[List[Dict[str, str]]], - operation: str = "sum", - sort_order: Optional[List[str]] = SORT_ORDER, ) -> Tuple[hl.expr.ArrayExpression, Dict[str, int]]: """ Merge frequency arrays from multiple datasets. @@ -1236,171 +1234,65 @@ def merge_freq_arrays( Farrays and fmeta do not need to be the same or in the same order. They will be merged based on the join type and operation. Missing values are dependent on the join type and operation. - :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. + :param farrays: List of frequency arrays to merge. First entry in the list is the + primary array to which other arrays will be added or subtracted. Array must be on the same HT. :param fmeta: List of frequency metadata for arrays being merged. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". IF "diff" is passed, the first array in the list will have the other arrays subtracted from it. - :param sort_order: Order of frequency metadata labels to use when building label when building key for the freq dictionary. + Options are "sum" and "diff". IF "diff" is passed, the first array in the list will have the other arrays subtracted from it. :return: Tuple of merged frequency array and its freq index dictionary. """ if len(farrays) < 2: raise ValueError("Must provide at least two frequency arrays to merge") - if operation not in ["sum", "diff"]: - raise ValueError("Operation must be either 'sum' or 'diff'") if len(farrays) != len(fmeta): raise ValueError("Length of farrays and fmeta must be equal") - # Make list of freq_dicts, one for each entry in the fmeta array, to - # ensure label formatting matches, - fmeta_dicts = [ - make_freq_index_dict_from_meta(meta, sort_order=sort_order) for meta in fmeta - ] - # Create dictionary from the first entry in fmeta to anchor ordering of groups - # in the merged freq array - merged_freq_dict = fmeta_dicts[0] - - # If summing multiple arrays, add groups present in each freq array - # dictionary to the anchor dictionary as keys and use the present length - # of the anchor dictionary as the index in the freq array. - if operation == "sum": - for fmeta_dict in fmeta_dicts[1:]: - for group in fmeta_dict.keys(): - if group not in merged_freq_dict.keys(): - merged_freq_dict[group] = len( - merged_freq_dict.keys() - ) # TODO: See if can update to hail, i.e all_groups = hl.array(hl.set(hl.flatten(hl.map(lambda d: d.keys(), fmeta_dict)))) does it but wont be sorted, look into hl.sorted? - - # Create array where the entries are the freq agg groupings which will - # then be transformed into our merged freqeunecy array of callstat structs - merged_freq_array = hl.map( - lambda x: x[0], hl.sorted(merged_freq_dict, key=lambda item: item[1]) - ) - - def _AC_merge( - group, zipped_array_dicts, operation="sum" - ) -> hl.expr.Int32Expression: - """ - Merge allele counts from multiple frequency arrays. - - :param group: Group to merge across freq arrays. - :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" - and "diff". If "diff" is passed, the first array in the list will have the other arrays AC subtracted from it. - :return: Merged allele count. - """ - # Sum ACs from each freq array for the given group - merged_AC = hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AC, 0), - zipped_array_dicts, - ) - ) - # If operation is diff, subtract the sum of all ACs for the given group - # from 2 * AC of the first entry in the freq array - if operation == "diff": - freq_to_subtract_from = zipped_array_dicts[0] - merged_AC = ( - 2 * freq_to_subtract_from[0][freq_to_subtract_from[1][group]].AC - - merged_AC - ) - merged_AC = hl.if_else(merged_AC < 0, 0, merged_AC) - return merged_AC - - def _AN_merge( - group, zipped_array_dicts, operation="sum" - ) -> hl.expr.Int32Expression: - """ - Merge allele numbers from multiple frequency arrays. - - :param group: Group to merge across freq arrays. - :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" - and "diff". If "diff" is passed, the first array in the list will have the other arrays AN subtracted from it. - :return: Merged allele number. - """ - # Sum ANs from each freq array for the given group - merged_AN = hl.sum( - hl.map( - lambda f: hl.if_else(f[1].contains(group), f[0][f[1][group]].AN, 0), - zipped_array_dicts, - ) - ) - # If operation is diff, subtract the sum of all ANs from 2*AN of the first - # entry in the freq array - if operation == "diff": - freq_to_subtract_from = zipped_array_dicts[0] - merged_AN = ( - 2 * freq_to_subtract_from[0][freq_to_subtract_from[1][group]].AN - - merged_AN + # Create a list where each entry is a dictionary whose the key is an aggregation + # group and the value is the corresponding index in the freq array. + fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta] + + # Merge dictionaries in the list into a single dictionary where key is aggregation + # group and the value is a list of the group's index in each of the freq arrays, if + # it exists. + fmeta = hl.fold( + lambda i, j: hl.dict( + (i.key_set() | j.key_set()).map( + lambda k: ( + k, + i.get(k, [hl.missing(hl.tint32)]).extend( + j.get(k, [hl.missing(hl.tint32)]) + ), + ) ) - merged_AN = hl.if_else(merged_AN < 0, 0, merged_AN) - return merged_AN - - def _AF_merge( - group, zipped_array_dicts, operation="sum" - ) -> hl.expr.Float32Expression: - """ - Calculate new allele frequencies using merged AC and AN from multiple frequency arrays. - - :param group: Group to merge across freq arrays. - :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" and "diff". - If "diff" is passed, the first array in the list will have the other arrays AC and AN subtracted from its own. - :return: Merged allele frequency. - """ - merged_AC = _AC_merge(group, zipped_array_dicts, operation=operation) - merged_AN = _AN_merge(group, zipped_array_dicts, operation=operation) - return hl.if_else( - merged_AN > 0, - merged_AC / merged_AN, - 0, - ) + ), + fmeta[0], + fmeta[1:], + ) - def _homozygote_count_merge( - group, zipped_array_dicts, operation="sum" - ) -> hl.expr.Int32Expression: - """ - Merge homozygote counts from multiple frequency arrays. - - :param group: Group to merge across freq arrays. - :param zipped_array_dicts: Zipped list of frequency arrays and their corresponding dictionaries. - :param operation: Operation to perform on the frequency arrays. Default is "sum". Options are "sum" - and "diff". If "diff" is passed, the first array in the list will have the other arrays' homozygote count subtracted from it. - :return: Merged homozygote count. - """ - # Sum homozygote counts from each freq array for the given group - sum_homozygote_count = hl.sum( - hl.map( - lambda f: hl.if_else( - f[1].contains(group), f[0][f[1][group]].homozygote_count, 0 + # Create a list of tuples from the dictionary, sorted by the list of indices for + # each aggregation group. + fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1]) + + # Create a list of the aggregation groups, maintaining the sorted order + new_freq_meta = fmeta.map(lambda x: x[0]) + + # Create array for each aggregation group of arrays containing the group's freq + # values from each freq array + freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]])) + + # Iterate through the groups and their freq lists to merge callstats + new_freq = freq_meta_idx.map( + lambda x: hl.bind( + lambda y: y.annotate(AF=hl.if_else(y.AN == 0, 0, y.AC / y.AN)), + hl.fold( + lambda i, j: hl.struct( + AC=hl.or_else(i.AC, 0) + hl.or_else(j.AC, 0), + AN=hl.or_else(i.AN, 0) + hl.or_else(j.AN, 0), + homozygote_count=hl.or_else(i.homozygote_count, 0) + + hl.or_else(j.homozygote_count, 0), ), - zipped_array_dicts, - ) - ) - # If operation is diff, subtract the sum of all homozygote counts from - # 2 * homozygote count of the first entry in the freq array - if operation == "diff": - freq_to_subtract_from = zipped_array_dicts[0] - sum_homozygote_count = ( - 2 - * freq_to_subtract_from[0][ - freq_to_subtract_from[1][group] - ].homozygote_count - - sum_homozygote_count - ) - sum_homozygote_count = hl.if_else( - sum_homozygote_count < 0, 0, sum_homozygote_count - ) - return sum_homozygote_count - - # Make a single merged freq array by mapping over the merged freq array and - # creating a new struct with the merged AC, AN, AF, and homozygote count - zfreq = hl.zip(farrays, fmeta_dicts) - merged_freq_array = merged_freq_array.map( - lambda group: hl.struct( - AC=_AC_merge(group, zfreq, operation=operation), - AN=_AN_merge(group, zfreq, operation=operation), - AF=_AF_merge(group, zfreq, operation=operation), - homozygote_count=_homozygote_count_merge(group, zfreq, operation=operation), + x[0].select("AC", "AN", "homozygote_count"), + x[1:], + ), ) ) - return merged_freq_array, merged_freq_dict + + return new_freq, new_freq_meta From 3499e40c0d989f5d3867170c593137355e4b6f83 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jun 2023 21:08:36 -0400 Subject: [PATCH 05/24] Add diff back with options for negative values --- gnomad/utils/annotations.py | 73 ++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index e5875eb12..14fc2147d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1227,6 +1227,8 @@ def make_freq_index_dict_from_meta( def merge_freq_arrays( farrays: List[hl.expr.ArrayExpression], fmeta: List[List[Dict[str, str]]], + operation="sum", + set_negatives_to_zero: bool = True, ) -> Tuple[hl.expr.ArrayExpression, Dict[str, int]]: """ Merge frequency arrays from multiple datasets. @@ -1235,26 +1237,39 @@ def merge_freq_arrays( merged based on the join type and operation. Missing values are dependent on the join type and operation. :param farrays: List of frequency arrays to merge. First entry in the list is the - primary array to which other arrays will be added or subtracted. Array must be on the same HT. + primary array to which other arrays will be added or subtracted. Array must be on + the same HT. :param fmeta: List of frequency metadata for arrays being merged. - Options are "sum" and "diff". IF "diff" is passed, the first array in the list will have the other arrays subtracted from it. + :param operation: Merge operation to perform. Options are "sum" and "diff". If + "diff" is passed, the first array in the list will have the other arrays subtracted + from it. + :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, + AF, and homozygote_count. If False, raise a ValueError. Default is True. :return: Tuple of merged frequency array and its freq index dictionary. """ if len(farrays) < 2: raise ValueError("Must provide at least two frequency arrays to merge") if len(farrays) != len(fmeta): raise ValueError("Length of farrays and fmeta must be equal") + if operation not in ["sum", "diff"]: + raise ValueError("Operation must be either 'sum' or 'diff'") # Create a list where each entry is a dictionary whose the key is an aggregation - # group and the value is the corresponding index in the freq array. + # group and the value is the corresponding index in the freq array. If operation is + # set to diff, only use the first entry's meta. fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta] # Merge dictionaries in the list into a single dictionary where key is aggregation # group and the value is a list of the group's index in each of the freq arrays, if - # it exists. + # it exists. For sum operation, use keys, aka groups, found in all freq dictionaries. + # For "diff" operations, only use key_set from the first entry. fmeta = hl.fold( lambda i, j: hl.dict( - (i.key_set() | j.key_set()).map( + ( + hl.if_else( + operation == "sum", (i.key_set() | j.key_set()), i.key_set() + ) # TODO: confirms this works on freq meta arrays with more than two entires + ).map( lambda k: ( k, i.get(k, [hl.missing(hl.tint32)]).extend( @@ -1278,21 +1293,59 @@ def merge_freq_arrays( # values from each freq array freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]])) + def _sum_or_diff_fields( + field_1_expr: str, field_2_expr: str + ) -> hl.expr.Int32Expression: + """ + Sum or subtract fields in call statistics struct. + + :param field_1_expr: First field to sum or diff. + :param field_2_expr: Second field to sum or diff. + :return: Merged field value. + """ + return hl.if_else( + operation == "sum", + hl.or_else(field_1_expr, 0) + hl.or_else(field_2_expr, 0), + hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0), + ) + # Iterate through the groups and their freq lists to merge callstats new_freq = freq_meta_idx.map( lambda x: hl.bind( - lambda y: y.annotate(AF=hl.if_else(y.AN == 0, 0, y.AC / y.AN)), + lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)), hl.fold( lambda i, j: hl.struct( - AC=hl.or_else(i.AC, 0) + hl.or_else(j.AC, 0), - AN=hl.or_else(i.AN, 0) + hl.or_else(j.AN, 0), - homozygote_count=hl.or_else(i.homozygote_count, 0) - + hl.or_else(j.homozygote_count, 0), + AC=_sum_or_diff_fields(i.AC, j.AC), + AN=_sum_or_diff_fields(i.AN, j.AN), + homozygote_count=_sum_or_diff_fields( + i.homozygote_count, j.homozygote_count + ), ), x[0].select("AC", "AN", "homozygote_count"), x[1:], ), ) ) + # Check and see if any annotation within the merged array is negative. If so, + # raise an error if set_negatives_to_zero is False or set the value to 0 if + # set_negatives_to_zero is True. + if operation == "diff": + negative_value_error_msg = ( + "Negative values found in merged frequency array. Review data or set" + " `set_negatives_to_zero` to True to set negative values to 0." + ) + new_freq = new_freq.map( + lambda x: x.annotate( + AC=hl.case() + .when(set_negatives_to_zero, hl.max(x.AC, 0)) + .or_error(negative_value_error_msg), + AN=hl.case() + .when(set_negatives_to_zero, hl.max(x.AN, 0)) + .or_error(negative_value_error_msg), + homozygote_count=hl.case() + .when(set_negatives_to_zero, hl.max(x.homozygote_count, 0)) + .or_error(negative_value_error_msg), + ) + ) return new_freq, new_freq_meta From 4d374d2ddd297a555e63898fa34b9d2d0a4bd699 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jun 2023 21:36:14 -0400 Subject: [PATCH 06/24] Add description of function --- gnomad/utils/annotations.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 14fc2147d..30a3c1199 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1231,18 +1231,22 @@ def merge_freq_arrays( set_negatives_to_zero: bool = True, ) -> Tuple[hl.expr.ArrayExpression, Dict[str, int]]: """ - Merge frequency arrays from multiple datasets. - - Farrays and fmeta do not need to be the same or in the same order. They will be - merged based on the join type and operation. Missing values are dependent on the - join type and operation. - :param farrays: List of frequency arrays to merge. First entry in the list is the - primary array to which other arrays will be added or subtracted. Array must be on - the same HT. + Merge frequency arrays on the same HT. + + .. note:: + Arrays do not have to contain the same groupings or order of groupings but + the array indices for a freq array in farrays must be the same as its associated + frequency metadata index in fmeta i.e., farrays = [freq1, freq2] then fmeta + must equal [fmeta1, fmeta2] where fmeta1 contains the metadata information + for freq1. If th merge operation is set to "sum", groups in the merged array + will be the union of groupings found within the arrays' metadata and all arrays + with be summed by grouping. If the operation is set to "diff", the merged array + will contain groups only found in the first array of fmeta. Any array containing + any of these groups will have thier values subtracted from the values of the first array. + + :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. :param fmeta: List of frequency metadata for arrays being merged. - :param operation: Merge operation to perform. Options are "sum" and "diff". If - "diff" is passed, the first array in the list will have the other arrays subtracted - from it. + :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it. :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True. :return: Tuple of merged frequency array and its freq index dictionary. From 3aa7f68b0c23431302dbb5b5c306aaa98612dcad Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jun 2023 21:42:30 -0400 Subject: [PATCH 07/24] Drop TODOs --- gnomad/utils/annotations.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 30a3c1199..222383b18 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1270,9 +1270,7 @@ def merge_freq_arrays( fmeta = hl.fold( lambda i, j: hl.dict( ( - hl.if_else( - operation == "sum", (i.key_set() | j.key_set()), i.key_set() - ) # TODO: confirms this works on freq meta arrays with more than two entires + hl.if_else(operation == "sum", (i.key_set() | j.key_set()), i.key_set()) ).map( lambda k: ( k, From 55340de933d3b044cf3b669ade4a919ee8ae57d5 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:15:53 -0400 Subject: [PATCH 08/24] Update gnomad/utils/annotations.py Fix incorrect formatting --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 222383b18..c133ffb7c 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1109,7 +1109,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} + {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} ) if prob_regions is not None: From 2d40b725caef67644b765054b9e4c5521a8fb67a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:16:27 -0400 Subject: [PATCH 09/24] Update gnomad/utils/annotations.py --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index c133ffb7c..ee654238d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -126,7 +126,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai], + lambda x: -x[1].AF[ai] # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct From ac50e048559c2b58a6ccede22eb3cb143b22ff8a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:44:43 -0400 Subject: [PATCH 10/24] Move to same module as original function --- gnomad/utils/annotations.py | 31 ------------------------------- gnomad/utils/release.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index ee654238d..144731d39 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1193,37 +1193,6 @@ def hemi_expr( ) -def make_freq_index_dict_from_meta( - freq_meta: List[Dict[str, str]], - label_delimiter: str = "_", - sort_order: Optional[List[str]] = SORT_ORDER, -) -> Dict[str, int]: - """ - Create a dictionary for accessing frequency array. - - :param freq_meta: List of dictionaries containing frequency metadata. - :param label_delimiter: Delimiter to use when joining frequency metadata labels. - :param sort_order: List of frequency metadata labels to use when sorting the dictionary. - :return: Dictionary of frequency metadata. - """ - index_dict = {} - for i, f in enumerate(hl.eval(freq_meta)): - if sort_order and len(set(f.keys()) - set(sort_order)) < 1: - index_dict[ - label_delimiter.join( - [ - f[g] - for g in sorted( - f.keys(), - key=(lambda x: sort_order.index(x)) if sort_order else None, - ) - ] - ) - ] = i - - return index_dict - - def merge_freq_arrays( farrays: List[hl.expr.ArrayExpression], fmeta: List[List[Dict[str, str]]], diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index feb9b1412..b0ed0c805 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -92,3 +92,34 @@ def _get_index(label_groups): ) return index_dict + + +def make_freq_index_dict_from_meta( + freq_meta: List[Dict[str, str]], + label_delimiter: str = "_", + sort_order: Optional[List[str]] = SORT_ORDER, +) -> Dict[str, int]: + """ + Create a dictionary for accessing frequency array. + + :param freq_meta: List of dictionaries containing frequency metadata. + :param label_delimiter: Delimiter to use when joining frequency metadata labels. + :param sort_order: List of frequency metadata labels to use when sorting the dictionary. + :return: Dictionary of frequency metadata. + """ + index_dict = {} + for i, f in enumerate(hl.eval(freq_meta)): + if sort_order and len(set(f.keys()) - set(sort_order)) < 1: + index_dict[ + label_delimiter.join( + [ + f[g] + for g in sorted( + f.keys(), + key=(lambda x: sort_order.index(x)) if sort_order else None, + ) + ] + ) + ] = i + + return index_dict From 8c29fb8166d4c67691d7b82dec6d294f93ffde47 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:46:30 -0400 Subject: [PATCH 11/24] Update imports --- gnomad/utils/release.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index b0ed0c805..cb0933626 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -2,6 +2,8 @@ from typing import Dict, List, Optional +import hail as hl + from gnomad.resources.grch38.gnomad import ( CURRENT_MAJOR_RELEASE, GROUPS, @@ -9,7 +11,7 @@ SEXES, SUBSETS, ) -from gnomad.utils.vcf import index_globals +from gnomad.utils.vcf import SORT_ORDER, index_globals def make_faf_index_dict( From ce76d83e9f7c90b14ec402a20483d54946ea954e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:47:29 -0400 Subject: [PATCH 12/24] Remove unnecessary imports --- gnomad/utils/annotations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 144731d39..d5240d469 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -6,7 +6,6 @@ import hail as hl from gnomad.utils.gen_stats import to_phred -from gnomad.utils.vcf import SORT_ORDER logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -126,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai] + lambda x: -x[1].AF[ai], # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -1109,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} + {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} ) if prob_regions is not None: From 93be0d31f6332c05a7c4774b7cd1ea2e6cd82a4c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 10:53:24 -0400 Subject: [PATCH 13/24] Apply suggestions from code review Correcting incorrect formatting again... --- gnomad/utils/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index d5240d469..4d99fa79e 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -125,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai], + lambda x: -x[1].AF[ai] # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -1108,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} + {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} ) if prob_regions is not None: From 0d4a67a8230c6b030cada48f89c96f5d4cd15ae9 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 11:00:53 -0400 Subject: [PATCH 14/24] update merge docs --- gnomad/utils/annotations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 4d99fa79e..e3b0d5272 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1202,6 +1202,7 @@ def merge_freq_arrays( Merge frequency arrays on the same HT. .. note:: + Arrays do not have to contain the same groupings or order of groupings but the array indices for a freq array in farrays must be the same as its associated frequency metadata index in fmeta i.e., farrays = [freq1, freq2] then fmeta From 19f3c261b8d4420726968c63c9203791fa3b2cc0 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 11:19:51 -0400 Subject: [PATCH 15/24] update merge docs --- gnomad/utils/annotations.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index e3b0d5272..9aaccf435 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -125,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai] + lambda x: -x[1].AF[ai], # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -1108,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} + {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} ) if prob_regions is not None: @@ -1216,8 +1216,7 @@ def merge_freq_arrays( :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. :param fmeta: List of frequency metadata for arrays being merged. :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it. - :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, - AF, and homozygote_count. If False, raise a ValueError. Default is True. + :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True. :return: Tuple of merged frequency array and its freq index dictionary. """ if len(farrays) < 2: From 50c2b22e764f2aa0bb81ec1ed85497b9f2319902 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 14 Jun 2023 11:30:19 -0400 Subject: [PATCH 16/24] Black is misformating these, hail errors without them --- gnomad/utils/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 9aaccf435..047ecf364 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -125,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai], + lambda x: -x[1].AF[ai] # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -1108,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} + {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} ) if prob_regions is not None: From c4d65b88ab52a25614e340918985b32c5fe597f6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 21 Jun 2023 09:01:42 -0400 Subject: [PATCH 17/24] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/annotations.py | 45 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 047ecf364..fb5de151c 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1197,43 +1197,42 @@ def merge_freq_arrays( fmeta: List[List[Dict[str, str]]], operation="sum", set_negatives_to_zero: bool = True, -) -> Tuple[hl.expr.ArrayExpression, Dict[str, int]]: +) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]: """ - Merge frequency arrays on the same HT. + Merge a list of frequency arrays based on the supplied `operation`. .. note:: Arrays do not have to contain the same groupings or order of groupings but - the array indices for a freq array in farrays must be the same as its associated - frequency metadata index in fmeta i.e., farrays = [freq1, freq2] then fmeta - must equal [fmeta1, fmeta2] where fmeta1 contains the metadata information - for freq1. If th merge operation is set to "sum", groups in the merged array + the array indices for a freq array in `farrays` must be the same as its associated + frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta` + must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information + for freq1. If `operation` is set to "sum", groups in the merged array will be the union of groupings found within the arrays' metadata and all arrays - with be summed by grouping. If the operation is set to "diff", the merged array - will contain groups only found in the first array of fmeta. Any array containing + with be summed by grouping. If `operation` is set to "diff", the merged array + will contain groups only found in the first array of `fmeta`. Any array containing any of these groups will have thier values subtracted from the values of the first array. - :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. Array must be on the same HT. + :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table. :param fmeta: List of frequency metadata for arrays being merged. :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it. :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True. :return: Tuple of merged frequency array and its freq index dictionary. """ if len(farrays) < 2: - raise ValueError("Must provide at least two frequency arrays to merge") + raise ValueError("Must provide at least two frequency arrays to merge!") if len(farrays) != len(fmeta): - raise ValueError("Length of farrays and fmeta must be equal") + raise ValueError("Length of farrays and fmeta must be equal!") if operation not in ["sum", "diff"]: - raise ValueError("Operation must be either 'sum' or 'diff'") + raise ValueError("Operation must be either 'sum' or 'diff'!") - # Create a list where each entry is a dictionary whose the key is an aggregation - # group and the value is the corresponding index in the freq array. If operation is - # set to diff, only use the first entry's meta. + # Create a list where each entry is a dictionary whose key is an aggregation + # group and the value is the corresponding index in the freq array. fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta] # Merge dictionaries in the list into a single dictionary where key is aggregation # group and the value is a list of the group's index in each of the freq arrays, if - # it exists. For sum operation, use keys, aka groups, found in all freq dictionaries. + # it exists. For "sum" operation, use keys, aka groups, found in all freq dictionaries. # For "diff" operations, only use key_set from the first entry. fmeta = hl.fold( lambda i, j: hl.dict( @@ -1256,11 +1255,11 @@ def merge_freq_arrays( # each aggregation group. fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1]) - # Create a list of the aggregation groups, maintaining the sorted order + # Create a list of the aggregation groups, maintaining the sorted order. new_freq_meta = fmeta.map(lambda x: x[0]) # Create array for each aggregation group of arrays containing the group's freq - # values from each freq array + # values from each freq array. freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]])) def _sum_or_diff_fields( @@ -1280,16 +1279,16 @@ def _sum_or_diff_fields( ) # Iterate through the groups and their freq lists to merge callstats + callstat_ann = ["AC", "AN", "homozygote_count"] new_freq = freq_meta_idx.map( lambda x: hl.bind( lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)), hl.fold( lambda i, j: hl.struct( - AC=_sum_or_diff_fields(i.AC, j.AC), - AN=_sum_or_diff_fields(i.AN, j.AN), - homozygote_count=_sum_or_diff_fields( - i.homozygote_count, j.homozygote_count - ), + **{ + ann: _sum_or_diff_fields(i[ann], j[ann]) + for ann in callstat_ann + } ), x[0].select("AC", "AN", "homozygote_count"), x[1:], From e6ad87f62205215ebe3e41d8a5fdb442e97aada7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 21 Jun 2023 09:11:16 -0400 Subject: [PATCH 18/24] Add AF to correction for when AC < 0 --- gnomad/utils/annotations.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index fb5de151c..bb4f5d746 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -125,7 +125,7 @@ def project_max_expr( > 0 ), # order the callstats computed by AF in decreasing order - lambda x: -x[1].AF[ai] + lambda x: -x[1].AF[ai], # take the n_projects projects with largest AF )[:n_projects].map( # add the project in the callstats struct @@ -1108,7 +1108,7 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} + {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} ) if prob_regions is not None: @@ -1195,12 +1195,15 @@ def hemi_expr( def merge_freq_arrays( farrays: List[hl.expr.ArrayExpression], fmeta: List[List[Dict[str, str]]], - operation="sum", + operation: str = "sum", set_negatives_to_zero: bool = True, ) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]: """ Merge a list of frequency arrays based on the supplied `operation`. + .. warning:: + Arrays must be on the same Table. + .. note:: Arrays do not have to contain the same groupings or order of groupings but @@ -1278,17 +1281,14 @@ def _sum_or_diff_fields( hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0), ) - # Iterate through the groups and their freq lists to merge callstats + # Iterate through the groups and their freq lists to merge callstats. callstat_ann = ["AC", "AN", "homozygote_count"] new_freq = freq_meta_idx.map( lambda x: hl.bind( lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)), hl.fold( lambda i, j: hl.struct( - **{ - ann: _sum_or_diff_fields(i[ann], j[ann]) - for ann in callstat_ann - } + **{ann: _sum_or_diff_fields(i[ann], j[ann]) for ann in callstat_ann} ), x[0].select("AC", "AN", "homozygote_count"), x[1:], @@ -1303,17 +1303,17 @@ def _sum_or_diff_fields( "Negative values found in merged frequency array. Review data or set" " `set_negatives_to_zero` to True to set negative values to 0." ) + callstat_ann.append("AF") new_freq = new_freq.map( lambda x: x.annotate( - AC=hl.case() - .when(set_negatives_to_zero, hl.max(x.AC, 0)) - .or_error(negative_value_error_msg), - AN=hl.case() - .when(set_negatives_to_zero, hl.max(x.AN, 0)) - .or_error(negative_value_error_msg), - homozygote_count=hl.case() - .when(set_negatives_to_zero, hl.max(x.homozygote_count, 0)) - .or_error(negative_value_error_msg), + **{ + ann: ( + hl.case() + .when(set_negatives_to_zero, hl.max(x[ann], 0)) + .or_error(negative_value_error_msg) + ) + for ann in callstat_ann + } ) ) From 95bf58c15db942d113b1f07dd715a7a826fa02ef Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 21 Jun 2023 10:12:22 -0400 Subject: [PATCH 19/24] Set default to error --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index bb4f5d746..fb301fdca 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1196,7 +1196,7 @@ def merge_freq_arrays( farrays: List[hl.expr.ArrayExpression], fmeta: List[List[Dict[str, str]]], operation: str = "sum", - set_negatives_to_zero: bool = True, + set_negatives_to_zero: bool = False, ) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]: """ Merge a list of frequency arrays based on the supplied `operation`. From 980b1e88c9468eb0e586b511bf41486bcc20020f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 21 Jun 2023 13:15:56 -0400 Subject: [PATCH 20/24] Remove optional sort_order, add warning and docs --- gnomad/utils/release.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index cb0933626..361578a20 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -1,5 +1,6 @@ # noqa: D100 +import logging from typing import Dict, List, Optional import hail as hl @@ -13,6 +14,13 @@ ) from gnomad.utils.vcf import SORT_ORDER, index_globals +logging.basicConfig( + format="%(asctime)s (%(name)s %(lineno)s): %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + def make_faf_index_dict( faf_meta: List[Dict[str, str]], @@ -99,16 +107,33 @@ def _get_index(label_groups): def make_freq_index_dict_from_meta( freq_meta: List[Dict[str, str]], label_delimiter: str = "_", - sort_order: Optional[List[str]] = SORT_ORDER, + sort_order: List[str] = SORT_ORDER, ) -> Dict[str, int]: """ Create a dictionary for accessing frequency array. + The dictionary is keyed by the grouping combinations found in the frequency metadata + array, where values are the corresponding 0-based indices for the groupings in the + frequency array. For example, if the freq_meta entry [{'pop': 'nfe'}, {'sex': 'XX'}] + corresponds to the 5th entry in the frequency array, the returned dictionary entry + would be {`nfe_XX`: 4}. + :param freq_meta: List of dictionaries containing frequency metadata. :param label_delimiter: Delimiter to use when joining frequency metadata labels. :param sort_order: List of frequency metadata labels to use when sorting the dictionary. :return: Dictionary of frequency metadata. """ + # Confirm all groups in freq_meta are in sort_order. Warn user if not. + diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(SORT_ORDER) + if diff: + logger.warning( + "Found unexpected frequency metadata groupings: %s. These groupings are" + " not present in the provided sort_order: %s. These groupings will not" + " be included in the returned dictionary.", + diff, + sort_order, + ) + index_dict = {} for i, f in enumerate(hl.eval(freq_meta)): if sort_order and len(set(f.keys()) - set(sort_order)) < 1: From 589a2fc1e6dbb4892e6fce7c94c4bee75ca20cf4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 23 Jun 2023 13:20:01 -0400 Subject: [PATCH 21/24] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/annotations.py | 4 +++- gnomad/utils/release.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index fb301fdca..7a2553a17 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1210,7 +1210,9 @@ def merge_freq_arrays( the array indices for a freq array in `farrays` must be the same as its associated frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta` must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information - for freq1. If `operation` is set to "sum", groups in the merged array + for freq1. + + If `operation` is set to "sum", groups in the merged array will be the union of groupings found within the arrays' metadata and all arrays with be summed by grouping. If `operation` is set to "diff", the merged array will contain groups only found in the first array of `fmeta`. Any array containing diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index 361578a20..24a2b628e 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -114,9 +114,9 @@ def make_freq_index_dict_from_meta( The dictionary is keyed by the grouping combinations found in the frequency metadata array, where values are the corresponding 0-based indices for the groupings in the - frequency array. For example, if the freq_meta entry [{'pop': 'nfe'}, {'sex': 'XX'}] + frequency array. For example, if the `freq_meta` entry [{'pop': 'nfe'}, {'sex': 'XX'}] corresponds to the 5th entry in the frequency array, the returned dictionary entry - would be {`nfe_XX`: 4}. + would be {'nfe_XX': 4}. :param freq_meta: List of dictionaries containing frequency metadata. :param label_delimiter: Delimiter to use when joining frequency metadata labels. @@ -124,7 +124,7 @@ def make_freq_index_dict_from_meta( :return: Dictionary of frequency metadata. """ # Confirm all groups in freq_meta are in sort_order. Warn user if not. - diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(SORT_ORDER) + diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order) if diff: logger.warning( "Found unexpected frequency metadata groupings: %s. These groupings are" From 65de2163f5261dd15e0c97ddfa8ac8671ece2e96 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 23 Jun 2023 13:25:58 -0400 Subject: [PATCH 22/24] Fix return doc and prevent formatting that breaks hail --- gnomad/utils/annotations.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 7a2553a17..ae33b7d48 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1108,7 +1108,8 @@ def region_flag_expr( :return: `region_flag` struct row annotation """ prob_flags_expr = ( - {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {} + {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar()) + } if non_par else {} # fmt: skip ) if prob_regions is not None: @@ -1210,8 +1211,8 @@ def merge_freq_arrays( the array indices for a freq array in `farrays` must be the same as its associated frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta` must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information - for freq1. - + for freq1. + If `operation` is set to "sum", groups in the merged array will be the union of groupings found within the arrays' metadata and all arrays with be summed by grouping. If `operation` is set to "diff", the merged array @@ -1222,7 +1223,7 @@ def merge_freq_arrays( :param fmeta: List of frequency metadata for arrays being merged. :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it. :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True. - :return: Tuple of merged frequency array and its freq index dictionary. + :return: Tuple of merged frequency array and its frequency metadata list. """ if len(farrays) < 2: raise ValueError("Must provide at least two frequency arrays to merge!") From 8fd0ef9eea4a7c7fc5dd15b242d0cab51a5a8928 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 23 Jun 2023 13:29:00 -0400 Subject: [PATCH 23/24] Make sort_order optional --- gnomad/utils/release.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index 24a2b628e..1ca77777b 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -107,7 +107,7 @@ def _get_index(label_groups): def make_freq_index_dict_from_meta( freq_meta: List[Dict[str, str]], label_delimiter: str = "_", - sort_order: List[str] = SORT_ORDER, + sort_order: Optional[List[str]] = SORT_ORDER, ) -> Dict[str, int]: """ Create a dictionary for accessing frequency array. @@ -136,7 +136,7 @@ def make_freq_index_dict_from_meta( index_dict = {} for i, f in enumerate(hl.eval(freq_meta)): - if sort_order and len(set(f.keys()) - set(sort_order)) < 1: + if sort_order is None or len(set(f.keys()) - set(sort_order)) < 1: index_dict[ label_delimiter.join( [ From 00418c60a4200cdb2bd722b3229f3f0c570f12e9 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 23 Jun 2023 13:56:26 -0400 Subject: [PATCH 24/24] Handle diff error when sort_order None is passed --- gnomad/utils/release.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py index 1ca77777b..61382ab2b 100644 --- a/gnomad/utils/release.py +++ b/gnomad/utils/release.py @@ -124,15 +124,16 @@ def make_freq_index_dict_from_meta( :return: Dictionary of frequency metadata. """ # Confirm all groups in freq_meta are in sort_order. Warn user if not. - diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order) - if diff: - logger.warning( - "Found unexpected frequency metadata groupings: %s. These groupings are" - " not present in the provided sort_order: %s. These groupings will not" - " be included in the returned dictionary.", - diff, - sort_order, - ) + if sort_order is not None: + diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order) + if diff: + logger.warning( + "Found unexpected frequency metadata groupings: %s. These groupings" + " are not present in the provided sort_order: %s. These groupings" + " will not be included in the returned dictionary.", + diff, + sort_order, + ) index_dict = {} for i, f in enumerate(hl.eval(freq_meta)):