Skip to content

Commit 415cce7

Browse files
committed
Revert "copy base branch"
This reverts commit 1aa1a45.
1 parent 1aa1a45 commit 415cce7

File tree

1 file changed

+53
-72
lines changed

1 file changed

+53
-72
lines changed

gnomad/utils/transcript_annotation.py

Lines changed: 53 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,76 @@
11
"""Utils module containing generic functions that are useful for adding transcript expression-aware annotations."""
2-
import logging
3-
from typing import Callable, List, Optional, Tuple, Union
2+
from typing import Callable, List, Optional, Tuple
43

54
import hail as hl
65

7-
logging.basicConfig(
8-
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
9-
datefmt="%m/%d/%Y %I:%M:%S %p",
10-
)
11-
logger = logging.getLogger("transcript_annotation_utils")
12-
logger.setLevel(logging.INFO)
136

14-
15-
def summarize_transcript_expression(
16-
mt: hl.MatrixTable,
17-
transcript_expression_expr: Union[hl.expr.NumericExpression, str] = "x",
18-
tissue_expr: Union[hl.expr.StringExpression, str] = "tissue",
7+
def summarize_rsem_mt(
8+
rsem_mt: hl.MatrixTable,
9+
rsem_expr: hl.expr.NumericExpression,
10+
tissue_expr: hl.expr.StringExpression,
1911
summary_agg_func: Optional[Callable] = None,
12+
tissue_as_row: bool = False,
2013
) -> Tuple[hl.Table, hl.Table]:
2114
"""
22-
Summarize a transcript expression MatrixTable by transcript, gene, and tissue.
15+
Summarize an RSEM table with ENSTs and ENSGs as rows and samples as columns by tissue.
2316
2417
The `summary_agg_func` argument allows the user to specify a Hail aggregation
2518
function to use to summarize the expression by tissue. By default, the median is
2619
used.
2720
28-
The returned Table has a row annotation for each tissue containing the summarized
29-
tissue expression value.
30-
31-
:param mt: MatrixTable of transcript (rows) expression quantifications (entry) by
32-
sample (columns).
33-
:param tissue_expr: Column expression indicating tissue type. Default is 'tissue'.
34-
:param transcript_expression_expr: Entry expression indicating transcript expression
35-
quantification. Default is 'x'.
36-
:param summary_agg_func: Optional aggregation function to use to summarize the
37-
transcript expression quantification by tissue. Example: `hl.agg.mean`. Default
38-
is None, which will use a median aggregation.
39-
:return: A Table of summarized transcript expression by tissue and a Table of
40-
summarized gene expression by tissue.
21+
.. note::
22+
23+
The outputs can be returned in one of the following formats:
24+
25+
- A Table with a field containing an array of summarized expression
26+
values by tissue, where the order of tissues in the array is indicated by
27+
the "tissues" global annotation (`tissue_as_row` set to False).
28+
- A Table with a row annotation for each tissue containing the summarized
29+
tissue expression value (`tissue_as_row` set to True).
30+
31+
:param rsem_mt: MatrixTable of RSEM quantifications.
32+
:param tissue_expr: Column expression indicating tissue type.
33+
:param rsem_expr: Entry expression indicating RSEM quantification.
34+
:param summary_agg_func: Optional aggregation function to use to summarize the RSEM
35+
values by tissue. Default is None, which will use a median aggregation.
36+
:param tissue_as_row: If True, return a Table with a row annotation for each tissue
37+
instead of an array of RSEM values. Default is False.
38+
:return: A Table of summarized transcript expression and a Table of summarized
39+
gene expression.
4140
"""
4241
if summary_agg_func is None:
4342
summary_agg_func = lambda x: hl.median(hl.agg.collect(x))
4443

45-
mt = mt.group_cols_by(tissue=tissue_expr).aggregate(
46-
tx=summary_agg_func(transcript_expression_expr)
44+
rsem_mt = rsem_mt.group_cols_by(tissue=tissue_expr).aggregate(
45+
transcript_expression=summary_agg_func(rsem_expr)
4746
)
4847

49-
transcript_ht = mt.rename({"tx": ""}).make_table()
50-
transcript_ht = transcript_ht.key_by("transcript_id", "gene_id")
51-
52-
gene_ht = transcript_ht.group_by("gene_id").aggregate(
53-
**{
54-
tissue: hl.agg.sum(transcript_ht[tissue])
55-
for tissue in list(transcript_ht.row_value)
56-
}
57-
)
58-
59-
return transcript_ht, gene_ht
60-
61-
62-
def tissue_expression_ht_to_array(
63-
ht: hl.Table,
64-
tissues: Optional[List[str]] = None,
65-
tissues_to_filter: Optional[List[str]] = None,
66-
) -> hl.Table:
67-
"""
68-
Convert a Table with a row annotation for each tissue to a Table with tissues as an array.
69-
70-
The output is a Table with a field 'tissue_expression' containing an array of
71-
summarized expression values by tissue, where the order of tissues in the array is
72-
indicated by the "tissues" global annotation.
73-
74-
:param ht: Table with a row annotation for each tissue.
75-
:param tissues: Optional list of tissues to keep in the 'tissue_expression' array.
76-
Default is all non-key rows in the Table.
77-
:param tissues_to_filter: Optional list of tissues to exclude from the tissue
78-
expression array.
79-
:return: Table with a field 'tissue_expression' containing an array of summarized
80-
expression values by tissue.
81-
"""
82-
if tissues is None:
83-
tissues = list(ht.row_value)
84-
85-
if tissues_to_filter is not None:
86-
logger.info("Filtering tissues: %s", tissues_to_filter)
87-
tissues = [t for t in tissues if t not in tissues_to_filter]
88-
89-
ht = ht.select_globals(tissues=tissues)
90-
ht = ht.select(tissue_expression=[ht[t] for t in tissues])
48+
if tissue_as_row:
49+
transcript_ht = rsem_mt.rename({"transcript_expression": ""}).make_table()
50+
gene_ht = transcript_ht.key_by("gene_id").drop("transcript_id")
51+
tissues = list(gene_ht.row)
52+
tissues.remove("gene_id")
53+
gene_ht = gene_ht.group_by(*gene_ht.key).aggregate(
54+
**{tissue: hl.agg.sum(gene_ht[tissue]) for tissue in tissues}
55+
)
56+
else:
57+
transcript_ht = rsem_mt.localize_entries(
58+
columns_array_field_name="tissues",
59+
entries_array_field_name="transcript_expression",
60+
)
61+
transcript_ht = transcript_ht.annotate(
62+
transcript_expression=transcript_ht.transcript_expression.map(
63+
lambda x: x.transcript_expression
64+
)
65+
)
66+
transcript_ht = transcript_ht.annotate_globals(
67+
tissues=transcript_ht.tissues.map(lambda x: x.tissue)
68+
)
69+
gene_ht = transcript_ht.group_by(transcript_ht.gene_id).aggregate(
70+
gene_expression=hl.agg.array_sum(transcript_ht.transcript_expression)
71+
)
9172

92-
return ht
73+
return transcript_ht.key_by("transcript_id", "gene_id"), gene_ht.key_by("gene_id")
9374

9475

9576
def get_expression_proportion(

0 commit comments

Comments
 (0)