-
Notifications
You must be signed in to change notification settings - Fork 31
Merge freq array function and new frequency dictionary builder #551
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
14819af
1f58568
87fd2fd
634ab95
3499e40
4d374d2
3aa7f68
55340de
2d40b72
ac50e04
8c29fb8
ce76d83
93be0d3
0d4a67a
19f3c26
50c2b22
c4d65b8
e6ad87f
95bf58c
980b1e8
589a2fc
65de216
8fd0ef9
00418c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -125,7 +125,7 @@ def project_max_expr( | |
> 0 | ||
), | ||
# order the callstats computed by AF in decreasing order | ||
lambda x: -x[1].AF[ai] | ||
lambda x: -x[1].AF[ai], | ||
mike-w-wilson marked this conversation as resolved.
Show resolved
Hide resolved
mike-w-wilson marked this conversation as resolved.
Show resolved
Hide resolved
mike-w-wilson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# take the n_projects projects with largest AF | ||
)[:n_projects].map( | ||
# add the project in the callstats struct | ||
|
@@ -219,8 +219,10 @@ def faf_expr( | |
~locus.in_autosome_or_par(), | ||
hl.struct( | ||
**{ | ||
f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency( | ||
freq[i].AC, freq[i].AN, threshold | ||
f"faf{str(threshold)[2:]}": ( | ||
hl.experimental.filtering_allele_frequency( | ||
freq[i].AC, freq[i].AN, threshold | ||
) | ||
) | ||
for threshold in faf_thresholds | ||
} | ||
|
@@ -1106,7 +1108,8 @@ def region_flag_expr( | |
:return: `region_flag` struct row annotation | ||
""" | ||
prob_flags_expr = ( | ||
{"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {} | ||
{"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar()) | ||
} if non_par else {} # fmt: skip | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ha the extra note made the line long enough that black wants parentheses there now... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh boy. I'm surprised it was even removing the parentheses, mine doesn't do that |
||
) | ||
|
||
if prob_regions is not None: | ||
|
@@ -1188,3 +1191,133 @@ def hemi_expr( | |
# mt.GT[0] is alternate allele | ||
gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1), | ||
) | ||
|
||
|
||
def merge_freq_arrays( | ||
farrays: List[hl.expr.ArrayExpression], | ||
fmeta: List[List[Dict[str, str]]], | ||
operation: str = "sum", | ||
set_negatives_to_zero: bool = False, | ||
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]: | ||
""" | ||
Merge a list of frequency arrays based on the supplied `operation`. | ||
|
||
.. warning:: | ||
Arrays must be on the same Table. | ||
|
||
jkgoodrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.. note:: | ||
|
||
Arrays do not have to contain the same groupings or order of groupings but | ||
the array indices for a freq array in `farrays` must be the same as its associated | ||
frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta` | ||
must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information | ||
for freq1. | ||
|
||
If `operation` is set to "sum", groups in the merged array | ||
will be the union of groupings found within the arrays' metadata and all arrays | ||
with be summed by grouping. If `operation` is set to "diff", the merged array | ||
will contain groups only found in the first array of `fmeta`. Any array containing | ||
any of these groups will have thier values subtracted from the values of the first array. | ||
|
||
:param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table. | ||
:param fmeta: List of frequency metadata for arrays being merged. | ||
:param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it. | ||
:param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True. | ||
:return: Tuple of merged frequency array and its frequency metadata list. | ||
""" | ||
if len(farrays) < 2: | ||
raise ValueError("Must provide at least two frequency arrays to merge!") | ||
if len(farrays) != len(fmeta): | ||
raise ValueError("Length of farrays and fmeta must be equal!") | ||
if operation not in ["sum", "diff"]: | ||
raise ValueError("Operation must be either 'sum' or 'diff'!") | ||
|
||
# Create a list where each entry is a dictionary whose key is an aggregation | ||
# group and the value is the corresponding index in the freq array. | ||
fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta] | ||
|
||
# Merge dictionaries in the list into a single dictionary where key is aggregation | ||
# group and the value is a list of the group's index in each of the freq arrays, if | ||
# it exists. For "sum" operation, use keys, aka groups, found in all freq dictionaries. | ||
# For "diff" operations, only use key_set from the first entry. | ||
jkgoodrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fmeta = hl.fold( | ||
lambda i, j: hl.dict( | ||
( | ||
hl.if_else(operation == "sum", (i.key_set() | j.key_set()), i.key_set()) | ||
).map( | ||
lambda k: ( | ||
k, | ||
i.get(k, [hl.missing(hl.tint32)]).extend( | ||
j.get(k, [hl.missing(hl.tint32)]) | ||
), | ||
) | ||
) | ||
), | ||
fmeta[0], | ||
fmeta[1:], | ||
) | ||
|
||
# Create a list of tuples from the dictionary, sorted by the list of indices for | ||
# each aggregation group. | ||
fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1]) | ||
|
||
# Create a list of the aggregation groups, maintaining the sorted order. | ||
new_freq_meta = fmeta.map(lambda x: x[0]) | ||
|
||
# Create array for each aggregation group of arrays containing the group's freq | ||
# values from each freq array. | ||
freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]])) | ||
|
||
def _sum_or_diff_fields( | ||
field_1_expr: str, field_2_expr: str | ||
) -> hl.expr.Int32Expression: | ||
""" | ||
Sum or subtract fields in call statistics struct. | ||
|
||
:param field_1_expr: First field to sum or diff. | ||
:param field_2_expr: Second field to sum or diff. | ||
:return: Merged field value. | ||
""" | ||
return hl.if_else( | ||
operation == "sum", | ||
hl.or_else(field_1_expr, 0) + hl.or_else(field_2_expr, 0), | ||
hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0), | ||
) | ||
|
||
# Iterate through the groups and their freq lists to merge callstats. | ||
callstat_ann = ["AC", "AN", "homozygote_count"] | ||
new_freq = freq_meta_idx.map( | ||
mike-w-wilson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
lambda x: hl.bind( | ||
lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)), | ||
hl.fold( | ||
lambda i, j: hl.struct( | ||
**{ann: _sum_or_diff_fields(i[ann], j[ann]) for ann in callstat_ann} | ||
), | ||
x[0].select("AC", "AN", "homozygote_count"), | ||
x[1:], | ||
), | ||
) | ||
) | ||
# Check and see if any annotation within the merged array is negative. If so, | ||
# raise an error if set_negatives_to_zero is False or set the value to 0 if | ||
# set_negatives_to_zero is True. | ||
if operation == "diff": | ||
negative_value_error_msg = ( | ||
"Negative values found in merged frequency array. Review data or set" | ||
" `set_negatives_to_zero` to True to set negative values to 0." | ||
) | ||
callstat_ann.append("AF") | ||
new_freq = new_freq.map( | ||
jkgoodrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||
lambda x: x.annotate( | ||
**{ | ||
ann: ( | ||
hl.case() | ||
.when(set_negatives_to_zero, hl.max(x[ann], 0)) | ||
.or_error(negative_value_error_msg) | ||
) | ||
for ann in callstat_ann | ||
} | ||
) | ||
) | ||
|
||
return new_freq, new_freq_meta |
Uh oh!
There was an error while loading. Please reload this page.