From 6f2d8a0ae8fe0de2f68187a9ef77166658631dee Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 24 Aug 2023 09:52:26 -0400 Subject: [PATCH 01/13] function to remove items from freq and freq_meta --- gnomad/utils/filtering.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index fba17eaa1..0bd7eced4 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -529,3 +529,30 @@ def split_vds_by_strata( ) return [hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items()] + + +def remove_items_from_freq(ht: hl.Table, items_to_remove: Dict[str, str]) -> hl.Table: + """ + Script to remove items from the freq array and freq_meta array in the Table. + + :param ht: Input Table with freq and freq_meta arrays. + :param items_to_remove: Dictionary of items to remove from the freq array and freq_meta array. + :return: Table with specified items removed from the freq array and freq_meta array. + """ + # TODO: we may have to change if we only have a list of keys to drop + freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) + + for k, v in items_to_remove.items(): + freq = hl.filter( + lambda f: (~f.meta.contains(k) | (f.meta.get(k) != v)), + freq, + ) + + ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) + + for k, v in items_to_remove.items(): + ht = ht.annotate_globals( + freq_meta=ht.freq_meta.filter(lambda m: (~m.contains(k) | (m.get(k) != v))) + ) + + return ht From 53e75f1648e25f9ca32996b63fedb56c85e94d87 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 05:47:37 -0400 Subject: [PATCH 02/13] make it general to list of keys and dict of keys-values --- gnomad/utils/filtering.py | 49 ++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index bc195a8d7..c4f8f1fcc 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -3,7 +3,7 @@ import functools import logging import operator -from typing import Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import hail as hl @@ -531,28 +531,55 @@ def split_vds_by_strata( return [hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items()] -def remove_items_from_freq(ht: hl.Table, items_to_remove: Dict[str, str]) -> hl.Table: +def remove_items_from_freq( + ht: hl.Table, items_to_remove: Union[Dict[str, List[Any]], List[Any]] +) -> hl.Table: """ Script to remove items from the freq array and freq_meta array in the Table. :param ht: Input Table with freq and freq_meta arrays. - :param items_to_remove: Dictionary of items to remove from the freq array and freq_meta array. + :param items_to_remove: Dictionary or list of items to remove from the freq and freq_meta arrays. :return: Table with specified items removed from the freq array and freq_meta array. """ - # TODO: we may have to change if we only have a list of keys to drop - freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) - for k, v in items_to_remove.items(): + def _remove_key_value_pair_from_freq( + ht: hl.Table, + key: str, + value: str, + ) -> hl.Table: + """ + Remove key-value pair from freq and freq_meta arrays. + + :param key: Key to remove from freq_meta array. + :param value: Value to remove from freq_meta array. + :param ht: Input Table with freq and freq_meta arrays. + :return: Table with specified key-value pair removed from freq and freq_meta arrays. + """ + freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) + freq = hl.filter( - lambda f: (~f.meta.contains(k) | (f.meta.get(k) != v)), + lambda f: (~f.meta.contains(key) | (f.meta.get(key) != value)), freq, ) + ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) + ht = ht.annotate_globals( + freq_meta=ht.freq_meta.filter( + lambda m: ~m.contains(key) | (m.get(key) != value) + ) + ) + return ht - ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) - - for k, v in items_to_remove.items(): + if isinstance(items_to_remove, list): + freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) + for key in items_to_remove: + freq = hl.filter(lambda f: ~f.meta.contains(key), freq) + ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) ht = ht.annotate_globals( - freq_meta=ht.freq_meta.filter(lambda m: (~m.contains(k) | (m.get(k) != v))) + freq_meta=ht.freq_meta.filter(lambda m: ~m.contains(key)) ) + elif isinstance(items_to_remove, dict): + for k, v in items_to_remove.items(): + for value in v: + ht = _remove_key_value_pair_from_freq(ht, k, value) return ht From ec030eb28b016a1bf79c153aa68dcf3b417ec8b1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:39:33 -0400 Subject: [PATCH 03/13] address comment review --- gnomad/utils/filtering.py | 60 +++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index c4f8f1fcc..801d48fcf 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -532,54 +532,66 @@ def split_vds_by_strata( def remove_items_from_freq( - ht: hl.Table, items_to_remove: Union[Dict[str, List[Any]], List[Any]] -) -> hl.Table: + freq_expr: hl.expr.ArrayExpression, + freq_meta_expr: hl.expr.ArrayExpression, + items_to_remove: Union[Dict[str, List[Any]], List[Any]], +) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: """ Script to remove items from the freq array and freq_meta array in the Table. - :param ht: Input Table with freq and freq_meta arrays. - :param items_to_remove: Dictionary or list of items to remove from the freq and freq_meta arrays. + :param freq_expr: ArrayExpression containing the freq array. + :param freq_meta_expr: ArrayExpression containing the freq_meta array. + :param items_to_remove: Dictionary or list of items to remove from the freq + and freq_meta arrays, the format has to be + {key: [value]}, {key: [value1, value2, ...]} or [key1, key2, ...]. :return: Table with specified items removed from the freq array and freq_meta array. """ + freq_meta_expr = freq_meta_expr.collect(_localize=False)[0] def _remove_key_value_pair_from_freq( - ht: hl.Table, + freq_expr: hl.expr.ArrayExpression, + freq_meta_expr: hl.expr.ArrayExpression, key: str, value: str, - ) -> hl.Table: + ) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: """ Remove key-value pair from freq and freq_meta arrays. + :param freq_expr: ArrayExpression containing the freq array. + :param freq_meta_expr: ArrayExpression containing the freq_meta array. :param key: Key to remove from freq_meta array. :param value: Value to remove from freq_meta array. - :param ht: Input Table with freq and freq_meta arrays. :return: Table with specified key-value pair removed from freq and freq_meta arrays. """ - freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) + freq_expr = hl.map( + lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) + ) - freq = hl.filter( + freq_expr = hl.filter( lambda f: (~f.meta.contains(key) | (f.meta.get(key) != value)), - freq, + freq_expr, ) - ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) - ht = ht.annotate_globals( - freq_meta=ht.freq_meta.filter( - lambda m: ~m.contains(key) | (m.get(key) != value) - ) + freq_expr = freq_expr.map(lambda x: x.drop("_meta")) + freq_meta_expr = freq_meta_expr.filter( + lambda m: ~m.contains(key) | (m.get(key) != value) ) - return ht + + return freq_expr, freq_meta_expr if isinstance(items_to_remove, list): - freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(ht.freq, ht.freq_meta)) - for key in items_to_remove: - freq = hl.filter(lambda f: ~f.meta.contains(key), freq) - ht = ht.annotate(freq=freq.map(lambda x: x[0:4])) - ht = ht.annotate_globals( - freq_meta=ht.freq_meta.filter(lambda m: ~m.contains(key)) + freq_expr = hl.map( + lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) ) + for key in items_to_remove: + freq_expr = hl.filter(lambda f: ~f.meta.contains(key), freq_expr) + freq_expr = freq_expr.map(lambda x: x.drop("_meta")) + freq_meta_expr = freq_meta_expr.filter(lambda m: ~m.contains(key)) elif isinstance(items_to_remove, dict): for k, v in items_to_remove.items(): for value in v: - ht = _remove_key_value_pair_from_freq(ht, k, value) - return ht + freq_expr, freq_meta_expr = _remove_key_value_pair_from_freq( + freq_expr, freq_meta_expr, k, value + ) + + return freq_expr, freq_meta_expr From 3a52ddcd887cca6f4ae2997fa537a28fcfb9cc8a Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:40:41 -0400 Subject: [PATCH 04/13] address comment review --- gnomad/utils/filtering.py | 47 +++++++++------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 801d48fcf..765e0c80c 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -547,51 +547,26 @@ def remove_items_from_freq( :return: Table with specified items removed from the freq array and freq_meta array. """ freq_meta_expr = freq_meta_expr.collect(_localize=False)[0] - - def _remove_key_value_pair_from_freq( - freq_expr: hl.expr.ArrayExpression, - freq_meta_expr: hl.expr.ArrayExpression, - key: str, - value: str, - ) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: - """ - Remove key-value pair from freq and freq_meta arrays. - - :param freq_expr: ArrayExpression containing the freq array. - :param freq_meta_expr: ArrayExpression containing the freq_meta array. - :param key: Key to remove from freq_meta array. - :param value: Value to remove from freq_meta array. - :return: Table with specified key-value pair removed from freq and freq_meta arrays. - """ - freq_expr = hl.map( - lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) - ) - - freq_expr = hl.filter( - lambda f: (~f.meta.contains(key) | (f.meta.get(key) != value)), - freq_expr, - ) - freq_expr = freq_expr.map(lambda x: x.drop("_meta")) - freq_meta_expr = freq_meta_expr.filter( - lambda m: ~m.contains(key) | (m.get(key) != value) - ) - - return freq_expr, freq_meta_expr + freq_expr = hl.map( + lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) + ) if isinstance(items_to_remove, list): - freq_expr = hl.map( - lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) - ) for key in items_to_remove: - freq_expr = hl.filter(lambda f: ~f.meta.contains(key), freq_expr) + freq_expr = hl.filter(lambda f: ~f._meta.contains(key), freq_expr) freq_expr = freq_expr.map(lambda x: x.drop("_meta")) freq_meta_expr = freq_meta_expr.filter(lambda m: ~m.contains(key)) elif isinstance(items_to_remove, dict): for k, v in items_to_remove.items(): for value in v: - freq_expr, freq_meta_expr = _remove_key_value_pair_from_freq( - freq_expr, freq_meta_expr, k, value + freq_expr = hl.filter( + lambda f: (~f._meta.contains(k) | (f._meta.get(k) != value)), + freq_expr, + ) + freq_expr = freq_expr.map(lambda x: x.drop("_meta")) + freq_meta_expr = freq_meta_expr.filter( + lambda m: ~m.contains(k) | (m.get(k) != value) ) return freq_expr, freq_meta_expr From f1d87a34937490b7c20751358b911bdbfa311385 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:15:29 -0400 Subject: [PATCH 05/13] add docstring to Julia's versatile version --- gnomad/utils/filtering.py | 73 +++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 765e0c80c..718472a26 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -531,42 +531,55 @@ def split_vds_by_strata( return [hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items()] -def remove_items_from_freq( +def filter_freq_by_meta( freq_expr: hl.expr.ArrayExpression, freq_meta_expr: hl.expr.ArrayExpression, - items_to_remove: Union[Dict[str, List[Any]], List[Any]], + items_to_filter: Union[Dict[str, List[Any]], List[Any]], + keep: bool = True, + operator: str = "and", ) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: """ - Script to remove items from the freq array and freq_meta array in the Table. - - :param freq_expr: ArrayExpression containing the freq array. - :param freq_meta_expr: ArrayExpression containing the freq_meta array. - :param items_to_remove: Dictionary or list of items to remove from the freq - and freq_meta arrays, the format has to be - {key: [value]}, {key: [value1, value2, ...]} or [key1, key2, ...]. - :return: Table with specified items removed from the freq array and freq_meta array. + Filter frequency and frequency meta expressions by freq_meta items. + + This function is designed to filter in different cases, for example: + simply filter by a list of keys, e.g. ["sex", "downsampling"], + or by specific populations by using {"pop": ["han", "papuan"]}, + or a more complicated use case: {"pop": ["afr"], "sex": ["XX"]}, + one can decide to keep or remove the items, either they appear + at the same time in one freq_meta dictionary by using "and", + or they appear in different freq_meta dictionaries by using "or". + + :param freq_expr: frequency expression + :param freq_meta_expr: frequency meta expression + :param items_to_filter: items to filter by, either a list or a dictionary + :param keep: whether to keep or remove the items + :param operator: whether to use "and" or "or" to combine the items + :return: filtered frequency and frequency meta expressions """ freq_meta_expr = freq_meta_expr.collect(_localize=False)[0] - freq_expr = hl.map( - lambda x: x[0].annotate(_meta=x[1]), hl.zip(freq_expr, freq_meta_expr) - ) - if isinstance(items_to_remove, list): - for key in items_to_remove: - freq_expr = hl.filter(lambda f: ~f._meta.contains(key), freq_expr) - freq_expr = freq_expr.map(lambda x: x.drop("_meta")) - freq_meta_expr = freq_meta_expr.filter(lambda m: ~m.contains(key)) - - elif isinstance(items_to_remove, dict): - for k, v in items_to_remove.items(): - for value in v: - freq_expr = hl.filter( - lambda f: (~f._meta.contains(k) | (f._meta.get(k) != value)), - freq_expr, - ) - freq_expr = freq_expr.map(lambda x: x.drop("_meta")) - freq_meta_expr = freq_meta_expr.filter( - lambda m: ~m.contains(k) | (m.get(k) != value) - ) + if operator == "and": + operator_func = hl.all + elif operator == "or": + operator_func = hl.any + + if isinstance(items_to_filter, list): + filter_func = lambda m, k: m.contains(k) + elif isinstance(items_to_filter, dict): + filter_func = lambda m, k: (m.get(k[0], "") == k[1]) + items_to_filter = [ + (k, v) for k, values in items_to_filter.items() for v in values + ] + else: + raise TypeError("") + + freq_meta_expr = hl.enumerate(freq_meta_expr).filter( + lambda m: hl.bind( + lambda x: hl.if_else(keep, x, ~x), + operator_func([filter_func(m[1], k) for k in items_to_filter]), + ), + ) + freq_expr = freq_meta_expr.map(lambda x: freq_expr[x[0]]) + freq_meta_expr = freq_meta_expr.map(lambda x: x[1]) return freq_expr, freq_meta_expr From 688eaf1ef072d3abb24772826267c55a29348179 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:26:59 -0400 Subject: [PATCH 06/13] merge from origin/main --- gnomad/utils/filtering.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index cfac5d632..5a3851368 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -515,20 +515,22 @@ def filter_for_mu( def split_vds_by_strata( vds: hl.vds.VariantDataset, strata_expr: hl.expr.Expression -) -> List[hl.vds.VariantDataset]: +) -> Dict[str, hl.vds.VariantDataset]: """ - Split a VDS into a list of VDSs based on `strata_expr`. + Split a VDS into multiple VDSs based on `strata_expr`. :param vds: Input VDS. :param strata_expr: Expression on VDS variant_data MT to split on. - :return: List of VDSs. + :return: Dictionary where strata value is key and VDS is value. """ vmt = vds.variant_data s_by_strata = vmt.aggregate_cols( hl.agg.group_by(strata_expr, hl.agg.collect_as_set(vmt.s)) ) - return [hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items()] + return { + strata: hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items() + } def filter_freq_by_meta( From fe7a612d578b859f642f5f693c005c54bd35faad Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:32:20 -0400 Subject: [PATCH 07/13] import Any from typing --- gnomad/utils/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 5a3851368..e114d4b24 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -3,7 +3,7 @@ import functools import logging import operator -from typing import Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import hail as hl From bc668a496eb8979c9f325ac31e2d9905992c581f Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:25:49 -0400 Subject: [PATCH 08/13] address review comments round 2 --- gnomad/utils/filtering.py | 50 ++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index e114d4b24..ca1b6d511 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -538,47 +538,63 @@ def filter_freq_by_meta( freq_meta_expr: hl.expr.ArrayExpression, items_to_filter: Union[Dict[str, List[Any]], List[Any]], keep: bool = True, - operator: str = "and", + combine_operator: str = "and", ) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: """ - Filter frequency and frequency meta expressions by freq_meta items. + Filter frequency and frequency meta expressions specified by `items_to_filter`. - This function is designed to filter in different cases, for example: - simply filter by a list of keys, e.g. ["sex", "downsampling"], - or by specific populations by using {"pop": ["han", "papuan"]}, - or a more complicated use case: {"pop": ["afr"], "sex": ["XX"]}, - one can decide to keep or remove the items, either they appear - at the same time in one freq_meta dictionary by using "and", - or they appear in different freq_meta dictionaries by using "or". + The `items_to_filter` can be used to filter in the following ways based on + `freq_meta_expr` items: + - By a list of keys, e.g. ["sex", "downsampling"]. + - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' + {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' + {"pop": ["afr"], "sex": ["XX"]}. - :param freq_expr: frequency expression + The items can be kept or removed from `freq_expr` and `freq_meta_expr` based on the + value of `keep`. + + The filtering can also be applied such that all criteria must be met + (`combine_operator` = "and") by the `freq_meta_expr` item in order to be filtered, + or at least one of the specified criteria must be met (`combine_operator` = "or") + by the `freq_meta_expr` item in order to be filtered. + + :param freq_expr: Frequency expression. :param freq_meta_expr: frequency meta expression - :param items_to_filter: items to filter by, either a list or a dictionary + :param items_to_filter: Items to filter by, either a list or a dictionary. :param keep: whether to keep or remove the items - :param operator: whether to use "and" or "or" to combine the items + :param combine_operator: Whether to use "and" or "or" to combine the items + specified by `items_to_filter`. :return: filtered frequency and frequency meta expressions """ freq_meta_expr = freq_meta_expr.collect(_localize=False)[0] - if operator == "and": + if combine_operator == "and": operator_func = hl.all - elif operator == "or": + elif combine_operator == "or": operator_func = hl.any + else: + raise ValueError( + "combine_operator must be one of 'and' or 'or', but found" + f" {combine_operator}" + ) if isinstance(items_to_filter, list): filter_func = lambda m, k: m.contains(k) + items_to_filter = [[k] for k in items_to_filter] elif isinstance(items_to_filter, dict): filter_func = lambda m, k: (m.get(k[0], "") == k[1]) items_to_filter = [ - (k, v) for k, values in items_to_filter.items() for v in values + [(k, v) for v in values] for k, values in items_to_filter.items() ] else: - raise TypeError("") + raise TypeError(f"items_to_filter must be a list or a dictionary") freq_meta_expr = hl.enumerate(freq_meta_expr).filter( lambda m: hl.bind( lambda x: hl.if_else(keep, x, ~x), - operator_func([filter_func(m[1], k) for k in items_to_filter]), + operator_func( + [hl.any([filter_func(m[1], v) for v in k]) for k in items_to_filter] + ), ), ) freq_expr = freq_meta_expr.map(lambda x: freq_expr[x[0]]) From 1e863e1a835e420df7d159a873d61e7c18f5ddc8 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:31:35 -0400 Subject: [PATCH 09/13] change fstring --- gnomad/utils/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index ca1b6d511..0512d0ded 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -587,7 +587,7 @@ def filter_freq_by_meta( [(k, v) for v in values] for k, values in items_to_filter.items() ] else: - raise TypeError(f"items_to_filter must be a list or a dictionary") + raise TypeError("items_to_filter must be a list or a dictionary") freq_meta_expr = hl.enumerate(freq_meta_expr).filter( lambda m: hl.bind( From a4032b703f33e06941a5a035800329390e6c4f38 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:45:37 -0400 Subject: [PATCH 10/13] fix unexpected indentation --- gnomad/utils/filtering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 0512d0ded..5a2f9d278 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -3,7 +3,7 @@ import functools import logging import operator -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import hail as hl @@ -539,7 +539,7 @@ def filter_freq_by_meta( items_to_filter: Union[Dict[str, List[Any]], List[Any]], keep: bool = True, combine_operator: str = "and", -) -> [hl.expr.ArrayExpression, hl.expr.ArrayExpression]: +) -> Tuple[hl.expr.ArrayExpression, hl.expr.ArrayExpression]: """ Filter frequency and frequency meta expressions specified by `items_to_filter`. From 4383dc7749af8d8a13f8aa8aadc289f84a8a804f Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:52:27 -0400 Subject: [PATCH 11/13] fix unexpected indentation again --- gnomad/utils/filtering.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 5a2f9d278..33f925089 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -545,10 +545,10 @@ def filter_freq_by_meta( The `items_to_filter` can be used to filter in the following ways based on `freq_meta_expr` items: - - By a list of keys, e.g. ["sex", "downsampling"]. - - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' - {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' - {"pop": ["afr"], "sex": ["XX"]}. + - By a list of keys, e.g. ["sex", "downsampling"]. + - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' + {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' + {"pop": ["afr"], "sex": ["XX"]}. The items can be kept or removed from `freq_expr` and `freq_meta_expr` based on the value of `keep`. From b1d749f069344b3d06ddccf8a1be06660266f8df Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:00:34 -0400 Subject: [PATCH 12/13] fix unexpected indentation again again --- gnomad/utils/filtering.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 33f925089..b5955656d 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -545,10 +545,10 @@ def filter_freq_by_meta( The `items_to_filter` can be used to filter in the following ways based on `freq_meta_expr` items: - - By a list of keys, e.g. ["sex", "downsampling"]. - - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' - {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' - {"pop": ["afr"], "sex": ["XX"]}. + - By a list of keys, e.g. ["sex", "downsampling"]. + - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' + {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' + {"pop": ["afr"], "sex": ["XX"]}. The items can be kept or removed from `freq_expr` and `freq_meta_expr` based on the value of `keep`. From 9888ce591ae5adac2be31094dbc826a592706127 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:29:45 -0400 Subject: [PATCH 13/13] commit omitted suggestions --- gnomad/utils/filtering.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index b5955656d..94e4c3b46 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -3,7 +3,7 @@ import functools import logging import operator -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import hail as hl @@ -536,7 +536,7 @@ def split_vds_by_strata( def filter_freq_by_meta( freq_expr: hl.expr.ArrayExpression, freq_meta_expr: hl.expr.ArrayExpression, - items_to_filter: Union[Dict[str, List[Any]], List[Any]], + items_to_filter: Union[Dict[str, List[str]], List[str]], keep: bool = True, combine_operator: str = "and", ) -> Tuple[hl.expr.ArrayExpression, hl.expr.ArrayExpression]: @@ -559,12 +559,12 @@ def filter_freq_by_meta( by the `freq_meta_expr` item in order to be filtered. :param freq_expr: Frequency expression. - :param freq_meta_expr: frequency meta expression + :param freq_meta_expr: Frequency meta expression. :param items_to_filter: Items to filter by, either a list or a dictionary. - :param keep: whether to keep or remove the items + :param keep: Whether to keep or remove the items specified by `items_to_filter`. :param combine_operator: Whether to use "and" or "or" to combine the items specified by `items_to_filter`. - :return: filtered frequency and frequency meta expressions + :return: Tuple of the filtered frequency and frequency meta expressions. """ freq_meta_expr = freq_meta_expr.collect(_localize=False)[0] @@ -575,7 +575,7 @@ def filter_freq_by_meta( else: raise ValueError( "combine_operator must be one of 'and' or 'or', but found" - f" {combine_operator}" + f" {combine_operator}!" ) if isinstance(items_to_filter, list): @@ -587,7 +587,7 @@ def filter_freq_by_meta( [(k, v) for v in values] for k, values in items_to_filter.items() ] else: - raise TypeError("items_to_filter must be a list or a dictionary") + raise TypeError("items_to_filter must be a list or a dictionary!") freq_meta_expr = hl.enumerate(freq_meta_expr).filter( lambda m: hl.bind(