Skip to content

Commit 978b577

Browse files
authored
Merge pull request #421 from wlu04/subset_samples_and_variants
Modified subset_samples_and_variants() to accept a VariantDataset as well as a MatrixTable, and added a remove_dead_alleles option
2 parents 953c83a + 91290cc commit 978b577

File tree

1 file changed

+33
-14
lines changed

1 file changed

+33
-14
lines changed

gnomad/utils/filtering.py

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -225,26 +225,37 @@ def add_filters_expr(
225225

226226

227227
def subset_samples_and_variants(
228-
mt: hl.MatrixTable,
228+
mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
229229
sample_path: str,
230230
header: bool = True,
231231
table_key: str = "s",
232232
sparse: bool = False,
233233
gt_expr: str = "GT",
234-
) -> hl.MatrixTable:
234+
remove_dead_alleles: bool = False,
235+
) -> Union[hl.MatrixTable, hl.vds.VariantDataset]:
235236
"""
236-
Subset the MatrixTable to the provided list of samples and their variants.
237+
Subset the MatrixTable or VariantDataset to the provided list of samples and their variants.
237238
238-
:param mt: Input MatrixTable
239+
:param mtds: Input MatrixTable or VariantDataset
239240
:param sample_path: Path to a file with list of samples
240241
:param header: Whether file with samples has a header. Default is True
241242
:param table_key: Key to sample Table. Default is "s"
242243
:param sparse: Whether the MatrixTable is sparse. Default is False
243244
:param gt_expr: Name of field in MatrixTable containing genotype expression. Default is "GT"
244-
:return: MatrixTable subsetted to specified samples and their variants
245+
:param remove_dead_alleles: Remove alleles observed in no samples. This option is currently only relevant when `mtds` is a VariantDataset. Default is False
246+
:return: MatrixTable or VariantDataset subsetted to specified samples and their variants
245247
"""
246248
sample_ht = hl.import_table(sample_path, no_header=not header, key=table_key)
247249
sample_count = sample_ht.count()
250+
is_vds = isinstance(mtds, hl.vds.VariantDataset)
251+
if is_vds:
252+
mt = mtds.variant_data
253+
else:
254+
if remove_dead_alleles:
255+
raise ValueError(
256+
"Removal of alleles observed in no samples is currently only implemented when the input dataset is a VariantDataset."
257+
)
258+
mt = mtds
248259
missing_ht = sample_ht.anti_join(mt.cols())
249260
missing_ht_count = missing_ht.count()
250261
full_count = mt.count_cols()
@@ -253,24 +264,32 @@ def subset_samples_and_variants(
253264
missing_samples = missing_ht.s.collect()
254265
raise DataException(
255266
f"Only {sample_count - missing_ht_count} out of {sample_count} "
256-
"subsetting-table IDs matched IDs in the MT.\n"
267+
f"subsetting-table IDs matched IDs in the {'VariantDataset' if is_vds else 'MatrixTable'}.\n"
257268
f"IDs that aren't in the MT: {missing_samples}\n"
258269
)
259270

260-
mt = mt.semi_join_cols(sample_ht)
261-
if sparse:
262-
mt = mt.filter_rows(
263-
hl.agg.any(mt[gt_expr].is_non_ref() | hl.is_defined(mt.END))
271+
if is_vds:
272+
mtds = hl.vds.filter_samples(
273+
mtds, sample_ht, keep=True, remove_dead_alleles=remove_dead_alleles
264274
)
275+
n_cols = mtds.variant_data.count_cols()
265276
else:
266-
mt = mt.filter_rows(hl.agg.any(mt[gt_expr].is_non_ref()))
277+
mtds = mtds.semi_join_cols(sample_ht)
278+
if sparse:
279+
mtds = mtds.filter_rows(
280+
hl.agg.any(mtds[gt_expr].is_non_ref() | hl.is_defined(mtds.END))
281+
)
282+
else:
283+
mtds = mtds.filter_rows(hl.agg.any(mtds[gt_expr].is_non_ref()))
284+
n_cols = mtds.count_cols()
267285

268286
logger.info(
269-
"Finished subsetting samples. Kept %d out of %d samples in MT",
270-
mt.count_cols(),
287+
"Finished subsetting samples. Kept %d out of %d samples in %s",
288+
n_cols,
271289
full_count,
290+
"VariantDataset" if is_vds else "MatrixTable",
272291
)
273-
return mt
292+
return mtds
274293

275294

276295
def filter_to_clinvar_pathogenic(

0 commit comments

Comments
 (0)