Revert "copy base branch"

KoalaQin · KoalaQin · commit 415cce764676 · 2023-12-13T15:19:27.000-05:00
This reverts commit 1aa1a45.
diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py
@@ -1,95 +1,76 @@
 """Utils module containing generic functions that are useful for adding transcript expression-aware annotations."""
-import logging
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import hail as hl
 
-logging.basicConfig(
-    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
-    datefmt="%m/%d/%Y %I:%M:%S %p",
-)
-logger = logging.getLogger("transcript_annotation_utils")
-logger.setLevel(logging.INFO)
 
-
-def summarize_transcript_expression(
-    mt: hl.MatrixTable,
-    transcript_expression_expr: Union[hl.expr.NumericExpression, str] = "x",
-    tissue_expr: Union[hl.expr.StringExpression, str] = "tissue",
+def summarize_rsem_mt(
+    rsem_mt: hl.MatrixTable,
+    rsem_expr: hl.expr.NumericExpression,
+    tissue_expr: hl.expr.StringExpression,
     summary_agg_func: Optional[Callable] = None,
+    tissue_as_row: bool = False,
 ) -> Tuple[hl.Table, hl.Table]:
     """
-    Summarize a transcript expression MatrixTable by transcript, gene, and tissue.
+    Summarize, by tissue, an RSEM MatrixTable with ENSTs and ENSGs as rows and samples as columns.
 
     The `summary_agg_func` argument allows the user to specify a Hail aggregation
     function to use to summarize the expression by tissue. By default, the median is
     used.
 
-    The returned Table has a row annotation for each tissue containing the summarized
-    tissue expression value.
-
-    :param mt: MatrixTable of transcript (rows) expression quantifications (entry) by
-        sample (columns).
-    :param tissue_expr: Column expression indicating tissue type. Default is 'tissue'.
-    :param transcript_expression_expr: Entry expression indicating transcript expression
-        quantification. Default is 'x'.
-    :param summary_agg_func: Optional aggregation function to use to summarize the
-        transcript expression quantification by tissue. Example: `hl.agg.mean`. Default
-        is None, which will use a median aggregation.
-    :return: A Table of summarized transcript expression by tissue and a Table of
-        summarized gene expression by tissue.
+    .. note::
+
+        The outputs can be returned in one of the following formats:
+
+        - A Table with a field containing an array of summarized expression
+          values by tissue, where the order of tissues in the array is indicated by
+          the "tissues" global annotation (`tissue_as_row` set to False).
+        - A Table with a row annotation for each tissue containing the summarized
+          tissue expression value (`tissue_as_row` set to True).
+
+    :param rsem_mt: MatrixTable of RSEM quantifications.
+    :param tissue_expr: Column expression indicating tissue type.
+    :param rsem_expr: Entry expression indicating RSEM quantification.
+    :param summary_agg_func: Optional aggregation function to use to summarize the RSEM
+        values by tissue. Default is None, which will use a median aggregation.
+    :param tissue_as_row: If True, return a Table with a row annotation for each tissue
+        instead of an array of summarized expression values. Default is False.
+    :return: A Table of summarized transcript expression and a Table of summarized
+        gene expression.
     """
     if summary_agg_func is None:
         summary_agg_func = lambda x: hl.median(hl.agg.collect(x))
 
-    mt = mt.group_cols_by(tissue=tissue_expr).aggregate(
-        tx=summary_agg_func(transcript_expression_expr)
+    rsem_mt = rsem_mt.group_cols_by(tissue=tissue_expr).aggregate(
+        transcript_expression=summary_agg_func(rsem_expr)
     )
 
-    transcript_ht = mt.rename({"tx": ""}).make_table()
-    transcript_ht = transcript_ht.key_by("transcript_id", "gene_id")
-
-    gene_ht = transcript_ht.group_by("gene_id").aggregate(
-        **{
-            tissue: hl.agg.sum(transcript_ht[tissue])
-            for tissue in list(transcript_ht.row_value)
-        }
-    )
-
-    return transcript_ht, gene_ht
-
-
-def tissue_expression_ht_to_array(
-    ht: hl.Table,
-    tissues: Optional[List[str]] = None,
-    tissues_to_filter: Optional[List[str]] = None,
-) -> hl.Table:
-    """
-    Convert a Table with a row annotation for each tissue to a Table with tissues as an array.
-
-    The output is a Table with a field 'tissue_expression' containing an array of
-    summarized expression values by tissue, where the order of tissues in the array is
-    indicated by the "tissues" global annotation.
-
-    :param ht: Table with a row annotation for each tissue.
-    :param tissues: Optional list of tissues to keep in the 'tissue_expression' array.
-        Default is all non-key rows in the Table.
-    :param tissues_to_filter: Optional list of tissues to exclude from the tissue
-        expression array.
-    :return: Table with a field 'tissue_expression' containing an array of summarized
-        expression values by tissue.
-    """
-    if tissues is None:
-        tissues = list(ht.row_value)
-
-    if tissues_to_filter is not None:
-        logger.info("Filtering tissues: %s", tissues_to_filter)
-        tissues = [t for t in tissues if t not in tissues_to_filter]
-
-    ht = ht.select_globals(tissues=tissues)
-    ht = ht.select(tissue_expression=[ht[t] for t in tissues])
+    if tissue_as_row:
+        transcript_ht = rsem_mt.rename({"transcript_expression": ""}).make_table()
+        gene_ht = transcript_ht.key_by("gene_id").drop("transcript_id")
+        tissues = list(gene_ht.row)
+        tissues.remove("gene_id")
+        gene_ht = gene_ht.group_by(*gene_ht.key).aggregate(
+            **{tissue: hl.agg.sum(gene_ht[tissue]) for tissue in tissues}
+        )
+    else:
+        transcript_ht = rsem_mt.localize_entries(
+            columns_array_field_name="tissues",
+            entries_array_field_name="transcript_expression",
+        )
+        transcript_ht = transcript_ht.annotate(
+            transcript_expression=transcript_ht.transcript_expression.map(
+                lambda x: x.transcript_expression
+            )
+        )
+        transcript_ht = transcript_ht.annotate_globals(
+            tissues=transcript_ht.tissues.map(lambda x: x.tissue)
+        )
+        gene_ht = transcript_ht.group_by(transcript_ht.gene_id).aggregate(
+            gene_expression=hl.agg.array_sum(transcript_ht.transcript_expression)
+        )
 
-    return ht
+    return transcript_ht.key_by("transcript_id", "gene_id"), gene_ht.key_by("gene_id")
 
 
 def get_expression_proportion(