From 333eeb2c7e68c350421dbe19b6378d0764c89a5e Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 23 Feb 2024 11:43:48 -0700 Subject: [PATCH 1/5] Fix sex ploidy adjustment so XX samples still get set to missing on chrY --- gnomad/sample_qc/sex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/sex.py b/gnomad/sample_qc/sex.py index 09ad75108..f3d4a631c 100644 --- a/gnomad/sample_qc/sex.py +++ b/gnomad/sample_qc/sex.py @@ -42,8 +42,8 @@ def adjusted_sex_ploidy_expr( return ( hl.case(missing_false=True) + .when(col_idx.xx & (row_idx.y_par | row_idx.y_nonpar), hl.missing(hl.tcall)) .when(~row_idx.in_non_par, gt_expr) - .when(col_idx.xx & (row_idx.y_par | row_idx.y_nonpar), hl.null(hl.tcall)) .when( col_idx.xy & (row_idx.x_nonpar | row_idx.y_nonpar) & gt_expr.is_het(), hl.null(hl.tcall), From d907983f08cc15a284c0be8a5167a91143129d12 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:06:53 -0700 Subject: [PATCH 2/5] Reduce the number of checks by entry in sex ploidy adjustment --- gnomad/sample_qc/sex.py | 9 ++++++--- gnomad/utils/annotations.py | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/gnomad/sample_qc/sex.py b/gnomad/sample_qc/sex.py index f3d4a631c..aea6c8696 100644 --- a/gnomad/sample_qc/sex.py +++ b/gnomad/sample_qc/sex.py @@ -42,14 +42,17 @@ def adjusted_sex_ploidy_expr( return ( hl.case(missing_false=True) - .when(col_idx.xx & (row_idx.y_par | row_idx.y_nonpar), hl.missing(hl.tcall)) + # Added to reduce the checks by entry. + .when(row_idx.in_autosome, gt_expr) + .when((row_idx.y_par | row_idx.y_nonpar) & col_idx.xx, hl.missing(hl.tcall)) + # Added to reduce the checks by entry. .when(~row_idx.in_non_par, gt_expr) .when( - col_idx.xy & (row_idx.x_nonpar | row_idx.y_nonpar) & gt_expr.is_het(), + (row_idx.x_nonpar | row_idx.y_nonpar) & col_idx.xy & gt_expr.is_het(), hl.null(hl.tcall), ) .when( - col_idx.xy & (row_idx.x_nonpar | row_idx.y_nonpar), + (row_idx.x_nonpar | row_idx.y_nonpar) & col_idx.xy, hl.call(gt_expr[0], phased=False), ) .default(gt_expr) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 2df5bd7d4..5994f724d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -640,6 +640,7 @@ def annotate_and_index_source_mt_for_sex_ploidy( ).cols() row_ht = source_mt.annotate_rows( in_non_par=~locus_expr.in_autosome_or_par(), + in_autosome=~locus_expr.in_autosome(), x_nonpar=locus_expr.in_x_nonpar(), y_par=locus_expr.in_y_par(), y_nonpar=locus_expr.in_y_nonpar(), From 8f22cbadaf32c19339f9451454a50eac0c24d006 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:08:08 -0700 Subject: [PATCH 3/5] Remove ~ from in_autosome --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 5994f724d..5ddfa5792 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -640,7 +640,7 @@ def annotate_and_index_source_mt_for_sex_ploidy( ).cols() row_ht = source_mt.annotate_rows( in_non_par=~locus_expr.in_autosome_or_par(), - in_autosome=~locus_expr.in_autosome(), + in_autosome=locus_expr.in_autosome(), x_nonpar=locus_expr.in_x_nonpar(), y_par=locus_expr.in_y_par(), y_nonpar=locus_expr.in_y_nonpar(), From bdd948d70ac8b8266dc61787ba9b891789c214d8 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:32:19 -0700 Subject: [PATCH 4/5] Update gnomad/sample_qc/sex.py Co-authored-by: Mike Wilson --- gnomad/sample_qc/sex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gnomad/sample_qc/sex.py b/gnomad/sample_qc/sex.py index aea6c8696..82ba62d5d 100644 --- a/gnomad/sample_qc/sex.py +++ b/gnomad/sample_qc/sex.py @@ -45,7 +45,6 @@ def adjusted_sex_ploidy_expr( # Added to reduce the checks by entry. .when(row_idx.in_autosome, gt_expr) .when((row_idx.y_par | row_idx.y_nonpar) & col_idx.xx, hl.missing(hl.tcall)) - # Added to reduce the checks by entry. .when(~row_idx.in_non_par, gt_expr) .when( (row_idx.x_nonpar | row_idx.y_nonpar) & col_idx.xy & gt_expr.is_het(), From f6e19213b005c33ce3ae27b210b3f2b267cf7641 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 23 Feb 2024 16:16:02 -0700 Subject: [PATCH 5/5] Move rekey to `densify_all_reference_sites` --- gnomad/utils/annotations.py | 4 ++-- gnomad/utils/sparse_mt.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 5ddfa5792..f64b45d4e 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -603,8 +603,8 @@ def create_frequency_bins_expr( def annotate_and_index_source_mt_for_sex_ploidy( - locus_expr: hl.expr.LocusExpression = None, - karyotype_expr: hl.expr.StringExpression = None, + locus_expr: hl.expr.LocusExpression, + karyotype_expr: hl.expr.StringExpression, xy_karyotype_str: str = "XY", xx_karyotype_str: str = "XX", ) -> Tuple[hl.expr.StructExpression, hl.expr.StructExpression]: diff --git a/gnomad/utils/sparse_mt.py b/gnomad/utils/sparse_mt.py index a5a59d1a3..ddf6fe1e0 100644 --- a/gnomad/utils/sparse_mt.py +++ b/gnomad/utils/sparse_mt.py @@ -990,6 +990,10 @@ def densify_all_reference_sites( # Unfilter entries so that entries with no ref block overlap aren't null. mt = mt.unfilter_entries() + # Rekey by requested row key field and drop unused keys. + mt = mt.key_rows_by(*row_key_fields) + mt = mt.drop(*[k for k in mt_row_key_fields if k not in row_key_fields]) + return mt @@ -1192,12 +1196,7 @@ def compute_stats_per_ref_site( select_fields=row_keep_fields, entry_agg_group_membership=entry_agg_group_membership, ) - ht = ht.checkpoint(hl.utils.new_temp_file("agg_stats", "ht")) - - # Drop no longer needed fields - current_keys = list(ht.key) - ht = ht.key_by(*row_key_fields).select_globals() - ht = ht.drop(*[k for k in current_keys if k not in row_key_fields]) + ht = ht.select_globals().checkpoint(hl.utils.new_temp_file("agg_stats", "ht")) group_globals = group_membership_ht.index_globals() global_expr = {}