easystats · strengejacke · Sep 18, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 1.2.0.8
+Version: 1.2.0.9
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,8 @@ S3method(as.double,parameters_smoothness)
 S3method(as.numeric,parameters_kurtosis)
 S3method(as.numeric,parameters_skewness)
 S3method(as.numeric,parameters_smoothness)
+S3method(as.prop.table,datawizard_crosstab)
+S3method(as.prop.table,datawizard_crosstabs)
 S3method(as.table,datawizard_crosstab)
 S3method(as.table,datawizard_crosstabs)
 S3method(as.table,datawizard_table)
@@ -230,6 +232,7 @@ S3method(winsorize,factor)
 S3method(winsorize,logical)
 S3method(winsorize,numeric)
 export(adjust)
+export(as.prop.table)
 export(assign_labels)
 export(categorize)
 export(center)

diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,9 @@ BREAKING CHANGES
 
 CHANGES
 
+* `data_tabulate()` now saves the table of proportions for crosstables as
+  attribute, accessible via the new `as.prop.table()` method (#656).
+
 * Due to changes in the package `insight`, `data_tabulate()` no longer prints
   decimals when all values in a column are integers (#641).
 

diff --git a/R/data_tabulate.R b/R/data_tabulate.R
@@ -58,6 +58,9 @@
 #' frequencies of the variable. This is useful for further statistical analysis,
 #' e.g. for using `chisq.test()` on the frequency table. See 'Examples'.
 #'
+#' Finally, the `as.prop.table()` method returns the proportions of the
+#' crosstable as a table object, if `by` was supplied to `data_tabulate()`.
+#'
 #' @section Crosstables:
 #' If `by` is supplied, a crosstable is created. The crosstable includes `<NA>`
 #' (missing) values by default. The first column indicates values of `x`, the
@@ -675,6 +678,94 @@ as.table.datawizard_crosstabs <- function(
   out
 }
 
+#' @rdname data_tabulate
+#' @export
+as.prop.table <- function(x, ...) {
+  UseMethod("as.prop.table")
+}
+
+#' @rdname data_tabulate
+#' @export
+as.prop.table.datawizard_crosstab <- function(
+  x,
+  remove_na = TRUE,
+  simplify = FALSE,
+  verbose = TRUE,
+  ...
+) {
+  # sanity check - the `.data.frame` method  returns a list, but not the
+  # default method
+  if (!is.data.frame(x)) {
+    x <- x[[1]]
+  }
+  prop_table <- attributes(x)$prop_table
+
+  if (is.null(prop_table)) {
+    insight::format_warning("No proportions available.")
+    return(NULL)
+  }
+
+  if (remove_na) {
+    if (
+      verbose &&
+        ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table))
+    ) {
+      insight::format_alert("Removing NA values from frequency table.")
+    }
+    if (!is.null(prop_table[["NA"]])) {
+      prop_table[["NA"]] <- NULL
+    }
+    if ("NA" %in% rownames(prop_table)) {
+      prop_table <- prop_table[rownames(prop_table) != "NA", ]
+    }
+  }
+  # coerce to table
+  result <- as.table(as.matrix(prop_table))
+  # if we don't want to simplify the table, we wrap it into a list
+  if (!simplify) {
+    result <- list(result)
+  }
+
+  result
+}
+
+#' @export
+as.prop.table.datawizard_crosstabs <- function(
+  x,
+  remove_na = TRUE,
+  simplify = FALSE,
+  verbose = TRUE,
+  ...
+) {
+  # only show message once we set `verbose = FALSE` in the lapply()
+  if (remove_na && verbose) {
+    prop_table <- attributes(x[[1]])$prop_table
+    if ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) {
+      insight::format_alert("Removing NA values from frequency table.")
+    }
+  }
+
+  out <- insight::compact_list(lapply(
+    x,
+    as.prop.table.datawizard_crosstab,
+    remove_na = remove_na,
+    simplify = TRUE,
+    verbose = FALSE,
+    ...
+  ))
+
+  # if no proportions found, return NULL
+  if (!length(out)) {
+    return(NULL)
+  }
+
+  # if only one table is returned, "unlist"
+  if (length(out) == 1 && simplify) {
+    out <- out[[1]]
+  }
+  out
+}
+
 
 .is_grouped_df_xtab <- function(x) {
   if (!is.data.frame(x)) {

diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
@@ -90,12 +90,100 @@
   attr(out, "proportions") <- proportions
   attr(out, "varname") <- obj_name
   attr(out, "grouped_df") <- !is.null(group_variable)
+  attr(out, "prop_table") <- .prop_table(out)
 
   class(out) <- c("datawizard_crosstab", "data.frame")
 
   out
 }
 
+# Helper function to calculate a table of proportions from a frequency table
+.prop_table <- function(x) {
+  # Extract the "proportions" attribute, which determines the type of calculation (row, column, or full)
+  props <- attributes(x)$proportions
+  out <- NULL
+
+  # Proceed only if the "proportions" attribute is set
+  if (!is.null(props)) {
+    # Identify numeric columns, as proportions are only calculated for these
+    numeric_columns <- vapply(x, is.numeric, logical(1))
+    # Get the total count from the attributes, used for "full" proportions
+    total_n <- attributes(x)$total_n
+
+    # Use a switch to perform the calculation based on the "props" value
+    out <- switch(
+      props,
+      # Calculate row-wise proportions
+      row = lapply(seq_len(nrow(x)), function(i) {
+        # Sum of the current row's numeric values
+        row_sum <- sum(x[i, numeric_columns], na.rm = TRUE)
+        # Avoid division by zero; if row sum is 0, return a row of zeros
+        if (row_sum == 0) {
+          tmp <- as.data.frame(as.list(rep(0, sum(numeric_columns))))
+          # for later rbind, we need identical column names
+          colnames(tmp) <- colnames(x)[numeric_columns]
+          tmp
+        } else {
+          x[i, numeric_columns] / row_sum
+        }
+      }),
+      # Calculate column-wise proportions
+      column = lapply(seq_len(ncol(x))[numeric_columns], function(i) {
+        # Sum of the current column's values
+        col_sum <- sum(x[, i], na.rm = TRUE)
+        # Avoid division by zero; if column sum is 0, return a vector of zeros
+        if (col_sum == 0) {
+          rep(0, nrow(x))
+        } else {
+          x[, i] / col_sum
+        }
+      }),
+      # Calculate proportions relative to the total count of the entire table
+      full = lapply(seq_len(ncol(x))[numeric_columns], function(i) {
+        # Avoid division by zero; if total is 0, return a vector of zeros
+        if (total_n == 0) {
+          rep(0, nrow(x))
+        } else {
+          x[, i] / total_n
+        }
+      })
+    )
+  }
+
+  # If a proportion table was calculated, format it into a data frame
+  if (!is.null(out)) {
+    # The output of the switch is a list. We need to bind it into a data frame.
+    # For row proportions, we bind rows. For column/full, we bind columns.
+    out <- switch(
+      props,
+      row = as.data.frame(do.call(rbind, out)),
+      as.data.frame(do.call(cbind, out))
+    )
+    # Set the column names of the new proportion table
+    colnames(out) <- colnames(x)[numeric_columns]
+
+    # Check if the dimensions are consistent before setting row names
+    if (nrow(out) == nrow(x)) {
+      # If the first column of the original data is not numeric, it's likely a
+      # label column. Use these labels as row names in the output for better
+      # readability. This is useful for identifying rows, especially when NAs
+      # are present.
+      if (isFALSE(numeric_columns[1])) {
+        r_names <- x[[1]]
+        r_names <- as.character(r_names)
+        # Replace NA in labels with the string "NA", else we cannot set rownames
+        r_names[is.na(r_names)] <- "NA"
+        rownames(out) <- r_names
+      } else {
+        # Otherwise, just use the original row names
+        rownames(out) <- rownames(x)
+      }
+    }
+  }
+
+  out
+}
+
 
 # methods ---------------------
 
@@ -113,7 +201,7 @@ format.datawizard_crosstab <- function(
   x <- as.data.frame(x)
 
   # find numeric columns, only for these we need row/column sums
-  numeric_columns <- vapply(x, is.numeric, logical(1))
+  numeric_columns <- which(vapply(x, is.numeric, logical(1)))
 
   # compute total N for rows and columns
   total_n <- attributes(x)$total_n
@@ -122,55 +210,18 @@ format.datawizard_crosstab <- function(
 
   # proportions?
   props <- attributes(x)$proportions
-
-  if (!is.null(props)) {
-    # we copy x to tmp, because when we create strings with "sprintf()", the
-    # variable is coerced to character, and in subsequent iterations of the loop,
-    # mathemathical operations are not possible anymore
-    tmp <- x
-    if (identical(props, "row")) {
-      for (i in seq_len(nrow(x))) {
-        row_sum <- sum(x[i, numeric_columns], na.rm = TRUE)
-        if (row_sum == 0) {
-          row_sum_string <- "(0%)"
-        } else {
-          row_sum_string <- sprintf(
-            "(%.*f%%)",
-            digits,
-            100 * x[i, numeric_columns] / row_sum
-          )
-        }
-        tmp[i, numeric_columns] <- paste(
-          format(x[i, numeric_columns]),
-          format(row_sum_string, justify = "right")
+  prop_table <- attributes(x)$prop_table
+
+  if (!is.null(props) && !is.null(prop_table)) {
+    for (i in seq_len(ncol(prop_table))) {
+      x[, numeric_columns[i]] <- paste(
+        format(x[, numeric_columns[i]]),
+        format(
+          sprintf("(%.*f%%)", digits, 100 * prop_table[, i]),
+          justify = "right"
         )
-      }
-    } else if (identical(props, "column")) {
-      for (i in seq_len(ncol(x))[numeric_columns]) {
-        col_sum <- sum(x[, i], na.rm = TRUE)
-        if (col_sum == 0) {
-          col_sum_string <- "(0%)"
-        } else {
-          col_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[, i] / col_sum)
-        }
-        tmp[, i] <- paste(
-          format(x[, i]),
-          format(col_sum_string, justify = "right")
-        )
-      }
-    } else if (identical(props, "full")) {
-      for (i in seq_len(ncol(x))[numeric_columns]) {
-        tmp[, i] <- paste(
-          format(x[, i]),
-          format(
-            sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n),
-            justify = "right"
-          )
-        )
-      }
+      )
     }
-    # copy back final result
-    x <- tmp
   }
 
   x[] <- lapply(x, as.character)

diff --git a/R/utils.R b/R/utils.R
@@ -163,7 +163,7 @@
   for (nm in setdiff(names(custom_attr), names(attributes(data.frame())))) {
     attr(data, which = nm) <- custom_attr[[nm]]
   }
-  return(data)
+  data
 }
 
 
@@ -241,7 +241,7 @@
 }
 
 
-#' Taken from https://github.com/coolbutuseless/gluestick [licence: MIT]
+#' Taken from https://github.com/coolbutuseless/gluestick (licence: MIT)
 #' Same functionality as `{glue}`
 #'
 #' @noRd
@@ -292,15 +292,15 @@
 
   # Evaluate
   if (eval) {
-    args <- lapply(exprs, function(expr) {
+    fun_args <- lapply(exprs, function(expr) {
       eval(parse(text = expr), envir = src)
     })
   } else {
-    args <- unname(mget(exprs, envir = as.environment(src)))
+    fun_args <- unname(mget(exprs, envir = as.environment(src)))
   }
 
   # Create the string(s)
-  do.call(sprintf, c(list(fmt_sprintf), args))
+  do.call(sprintf, c(list(fmt_sprintf), fun_args))
 }
 
 

diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd