|
1 | 1 | """Utils module containing generic functions that are useful for adding transcript expression-aware annotations.""" |
2 | | -import logging |
3 | | -from typing import Callable, List, Optional, Tuple, Union |
| 2 | +from typing import Callable, List, Optional, Tuple |
4 | 3 |
|
5 | 4 | import hail as hl |
6 | 5 |
|
7 | | -logging.basicConfig( |
8 | | - format="%(asctime)s (%(name)s %(lineno)s): %(message)s", |
9 | | - datefmt="%m/%d/%Y %I:%M:%S %p", |
10 | | -) |
11 | | -logger = logging.getLogger("transcript_annotation_utils") |
12 | | -logger.setLevel(logging.INFO) |
13 | 6 |
|
14 | | - |
15 | | -def summarize_transcript_expression( |
16 | | - mt: hl.MatrixTable, |
17 | | - transcript_expression_expr: Union[hl.expr.NumericExpression, str] = "x", |
18 | | - tissue_expr: Union[hl.expr.StringExpression, str] = "tissue", |
| 7 | +def summarize_rsem_mt( |
| 8 | + rsem_mt: hl.MatrixTable, |
| 9 | + rsem_expr: hl.expr.NumericExpression, |
| 10 | + tissue_expr: hl.expr.StringExpression, |
19 | 11 | summary_agg_func: Optional[Callable] = None, |
| 12 | + tissue_as_row: bool = False, |
20 | 13 | ) -> Tuple[hl.Table, hl.Table]: |
21 | 14 | """ |
22 | | - Summarize a transcript expression MatrixTable by transcript, gene, and tissue. |
| 15 | + Summarize an RSEM table with ENSTs and ENSGs as rows and samples as columns by tissue. |
23 | 16 |
|
24 | 17 | The `summary_agg_func` argument allows the user to specify a Hail aggregation |
25 | 18 | function to use to summarize the expression by tissue. By default, the median is |
26 | 19 | used. |
27 | 20 |
|
28 | | - The returned Table has a row annotation for each tissue containing the summarized |
29 | | - tissue expression value. |
30 | | -
|
31 | | - :param mt: MatrixTable of transcript (rows) expression quantifications (entry) by |
32 | | - sample (columns). |
33 | | - :param tissue_expr: Column expression indicating tissue type. Default is 'tissue'. |
34 | | - :param transcript_expression_expr: Entry expression indicating transcript expression |
35 | | - quantification. Default is 'x'. |
36 | | - :param summary_agg_func: Optional aggregation function to use to summarize the |
37 | | - transcript expression quantification by tissue. Example: `hl.agg.mean`. Default |
38 | | - is None, which will use a median aggregation. |
39 | | - :return: A Table of summarized transcript expression by tissue and a Table of |
40 | | - summarized gene expression by tissue. |
| 21 | + .. note:: |
| 22 | +
|
| 23 | + The outputs can be returned in one of the following formats: |
| 24 | +
|
| 25 | + - A Table with a field containing an array of summarized expression |
| 26 | + values by tissue, where the order of tissues in the array is indicated by |
| 27 | + the "tissues" global annotation (`tissue_as_row` set to False). |
| 28 | + - A Table with a row annotation for each tissue containing the summarized |
| 29 | + tissue expression value (`tissue_as_row` set to True). |
| 30 | +
|
| 31 | + :param rsem_mt: MatrixTable of RSEM quantifications. |
| 32 | + :param tissue_expr: Column expression indicating tissue type. |
| 33 | + :param rsem_expr: Entry expression indicating RSEM quantification. |
| 34 | + :param summary_agg_func: Optional aggregation function to use to summarize the RSEM |
| 35 | + values by tissue. Default is None, which will use a median aggregation. |
| 36 | + :param tissue_as_row: If True, return a Table with a row annotation for each tissue |
| 37 | + instead of an array of RSEM values. Default is False. |
| 38 | + :return: A Table of summarized transcript expression and a Table of summarized |
| 39 | + gene expression. |
41 | 40 | """ |
42 | 41 | if summary_agg_func is None: |
43 | 42 | summary_agg_func = lambda x: hl.median(hl.agg.collect(x)) |
44 | 43 |
|
45 | | - mt = mt.group_cols_by(tissue=tissue_expr).aggregate( |
46 | | - tx=summary_agg_func(transcript_expression_expr) |
| 44 | + rsem_mt = rsem_mt.group_cols_by(tissue=tissue_expr).aggregate( |
| 45 | + transcript_expression=summary_agg_func(rsem_expr) |
47 | 46 | ) |
48 | 47 |
|
49 | | - transcript_ht = mt.rename({"tx": ""}).make_table() |
50 | | - transcript_ht = transcript_ht.key_by("transcript_id", "gene_id") |
51 | | - |
52 | | - gene_ht = transcript_ht.group_by("gene_id").aggregate( |
53 | | - **{ |
54 | | - tissue: hl.agg.sum(transcript_ht[tissue]) |
55 | | - for tissue in list(transcript_ht.row_value) |
56 | | - } |
57 | | - ) |
58 | | - |
59 | | - return transcript_ht, gene_ht |
60 | | - |
61 | | - |
62 | | -def tissue_expression_ht_to_array( |
63 | | - ht: hl.Table, |
64 | | - tissues: Optional[List[str]] = None, |
65 | | - tissues_to_filter: Optional[List[str]] = None, |
66 | | -) -> hl.Table: |
67 | | - """ |
68 | | - Convert a Table with a row annotation for each tissue to a Table with tissues as an array. |
69 | | -
|
70 | | - The output is a Table with a field 'tissue_expression' containing an array of |
71 | | - summarized expression values by tissue, where the order of tissues in the array is |
72 | | - indicated by the "tissues" global annotation. |
73 | | -
|
74 | | - :param ht: Table with a row annotation for each tissue. |
75 | | - :param tissues: Optional list of tissues to keep in the 'tissue_expression' array. |
76 | | - Default is all non-key rows in the Table. |
77 | | - :param tissues_to_filter: Optional list of tissues to exclude from the tissue |
78 | | - expression array. |
79 | | - :return: Table with a field 'tissue_expression' containing an array of summarized |
80 | | - expression values by tissue. |
81 | | - """ |
82 | | - if tissues is None: |
83 | | - tissues = list(ht.row_value) |
84 | | - |
85 | | - if tissues_to_filter is not None: |
86 | | - logger.info("Filtering tissues: %s", tissues_to_filter) |
87 | | - tissues = [t for t in tissues if t not in tissues_to_filter] |
88 | | - |
89 | | - ht = ht.select_globals(tissues=tissues) |
90 | | - ht = ht.select(tissue_expression=[ht[t] for t in tissues]) |
| 48 | + if tissue_as_row: |
| 49 | + transcript_ht = rsem_mt.rename({"transcript_expression": ""}).make_table() |
| 50 | + gene_ht = transcript_ht.key_by("gene_id").drop("transcript_id") |
| 51 | + tissues = list(gene_ht.row) |
| 52 | + tissues.remove("gene_id") |
| 53 | + gene_ht = gene_ht.group_by(*gene_ht.key).aggregate( |
| 54 | + **{tissue: hl.agg.sum(gene_ht[tissue]) for tissue in tissues} |
| 55 | + ) |
| 56 | + else: |
| 57 | + transcript_ht = rsem_mt.localize_entries( |
| 58 | + columns_array_field_name="tissues", |
| 59 | + entries_array_field_name="transcript_expression", |
| 60 | + ) |
| 61 | + transcript_ht = transcript_ht.annotate( |
| 62 | + transcript_expression=transcript_ht.transcript_expression.map( |
| 63 | + lambda x: x.transcript_expression |
| 64 | + ) |
| 65 | + ) |
| 66 | + transcript_ht = transcript_ht.annotate_globals( |
| 67 | + tissues=transcript_ht.tissues.map(lambda x: x.tissue) |
| 68 | + ) |
| 69 | + gene_ht = transcript_ht.group_by(transcript_ht.gene_id).aggregate( |
| 70 | + gene_expression=hl.agg.array_sum(transcript_ht.transcript_expression) |
| 71 | + ) |
91 | 72 |
|
92 | | - return ht |
| 73 | + return transcript_ht.key_by("transcript_id", "gene_id"), gene_ht.key_by("gene_id") |
93 | 74 |
|
94 | 75 |
|
95 | 76 | def get_expression_proportion( |
|
0 commit comments