diff --git a/NAMESPACE b/NAMESPACE index a48533d..6012f97 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,11 @@ S3method(sample_n,SingleCellExperiment) S3method(select,SingleCellExperiment) S3method(separate,SingleCellExperiment) S3method(slice,SingleCellExperiment) +S3method(slice_head,SingleCellExperiment) +S3method(slice_max,SingleCellExperiment) +S3method(slice_min,SingleCellExperiment) +S3method(slice_sample,SingleCellExperiment) +S3method(slice_tail,SingleCellExperiment) S3method(summarise,SingleCellExperiment) S3method(summarize,SingleCellExperiment) S3method(tbl_format_header,tidySingleCellExperiment) @@ -82,6 +87,11 @@ importFrom(dplyr,sample_frac) importFrom(dplyr,sample_n) importFrom(dplyr,select) importFrom(dplyr,slice) +importFrom(dplyr,slice_head) +importFrom(dplyr,slice_max) +importFrom(dplyr,slice_min) +importFrom(dplyr,slice_sample) +importFrom(dplyr,slice_tail) importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,vars) @@ -111,6 +121,7 @@ importFrom(purrr,reduce) importFrom(purrr,when) importFrom(rlang,":=") importFrom(rlang,dots_values) +importFrom(rlang,enexpr) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,expr) @@ -130,6 +141,7 @@ importFrom(stringr,str_replace_all) importFrom(tibble,as_tibble) importFrom(tibble,enframe) importFrom(tibble,glimpse) +importFrom(tibble,rowid_to_column) importFrom(tidyr,extract) importFrom(tidyr,nest) importFrom(tidyr,pivot_longer) diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 6910153..db079f9 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -507,6 +507,188 @@ slice.SingleCellExperiment <- function(.data, ..., .by=NULL, .preserve=FALSE) { .data[, rownames(new_meta)] } +#' @name slice_sample +#' @rdname slice +#' @inherit dplyr::slice_sample +#' @examples +#' data(pbmc_small) +#' pbmc_small |> slice_sample(n=1) +#' pbmc_small |> slice_sample(prop=0.1) +#' +#' @importFrom SummarizedExperiment colData +#' @importFrom dplyr slice_sample +#' @export +slice_sample.SingleCellExperiment <- function(.data, ..., n=NULL, + prop=NULL, by=NULL, weight_by=NULL, replace=FALSE) { + lifecycle::signal_superseded("1.0.0", "sample_n()", "slice_sample()") + + if (!is.null(n)) + new_meta <- + .data |> + colData() |> + as_tibble(rownames=c_(.data)$name) |> + select(-everything(), c_(.data)$name, {{ by }}, {{ weight_by }}) |> + slice_sample(..., n=n, by={{ by }}, + weight_by={{ weight_by }}, replace=replace) + else if (!is.null(prop)) + new_meta <- + .data |> + colData() |> + as_tibble(rownames=c_(.data)$name) |> + select(-everything(), c_(.data)$name, {{ by }}, {{ weight_by }}) |> + slice_sample(..., prop=prop, by={{ by }}, + weight_by={{ weight_by }}, replace=replace) + else + stop("tidySingleCellExperiment says:", + " you should provide `n` or `prop` arguments") + + count_cells <- new_meta %>% + select(!!c_(.data)$symbol) %>% + count(!!c_(.data)$symbol) + + .max_cell_count <- ifelse(nrow(count_cells)==0, 0, max(count_cells$n)) + + # If repeated cells due to replacement + if (.max_cell_count |> gt(1)){ + message("tidySingleCellExperiment says: When sampling with replacement", + " a data frame is returned for independent data analysis.") + .data |> + as_tibble() |> + right_join(new_meta %>% + select(!!c_(.data)$symbol), by=c_(.data)$name) + } else { + .data[, pull(new_meta, !!c_(.data)$symbol)] + } +} + +#' @name slice_head +#' @rdname slice +#' @inherit dplyr::slice_head +#' @examples +#' data(pbmc_small) +#' # First rows based on existing order +#' pbmc_small |> slice_head(n=5) +#' +#' @importFrom dplyr slice_head +#' @importFrom tibble rowid_to_column +#' @export +slice_head.SingleCellExperiment <- function(.data, ..., n, prop, by=NULL) { + row_number___ <- NULL + idx <- .data |> + colData() |> + as.data.frame() |> + select(-everything(), {{ by }}) |> + rowid_to_column(var='row_number___') |> + slice_head(..., n=n, prop=prop, by={{ by }}) |> + pull(row_number___) + + new_obj <- .data[, idx] + new_obj +} + +#' @name slice_tail +#' @rdname slice +#' @inherit dplyr::slice_tail +#' @examples +#' data(pbmc_small) +#' # First rows based on existing order +#' pbmc_small |> slice_tail(n=5) +#' +#' @importFrom dplyr slice_tail +#' @importFrom tibble rowid_to_column +#' @export +slice_tail.SingleCellExperiment <- function(.data, ..., n, prop, by=NULL) { + row_number___ <- NULL + idx <- .data |> + colData() |> + as.data.frame() |> + select(-everything(), {{ by }}) |> + rowid_to_column(var='row_number___') |> + slice_tail(..., n=n, prop=prop, by={{ by }}) |> + pull(row_number___) + + new_obj <- .data[, idx] + new_obj +} + +#' @name slice_min +#' @rdname slice +#' @inherit dplyr::slice_min +#' @examples +#' data(pbmc_small) +#' +#' # Rows with minimum and maximum values of a metadata variable +#' pbmc_small |> slice_min(nFeature_RNA, n=5) +#' +#' # slice_min() and slice_max() may return more rows than requested +#' # in the presence of ties. +#' pbmc_small |> slice_min(nFeature_RNA, n=2) +#' +#' # Use with_ties=FALSE to return exactly n matches +#' pbmc_small |> slice_min(nFeature_RNA, n=2, with_ties=FALSE) +#' +#' # Or use additional variables to break the tie: +#' pbmc_small |> slice_min(tibble::tibble(nFeature_RNA, nCount_RNA), n=2) +#' +#' # Use by for group-wise operations +#' pbmc_small |> slice_min(nFeature_RNA, n=5, by=groups) +#' +#' @importFrom dplyr slice_min +#' @importFrom tibble rowid_to_column +#' @export +slice_min.SingleCellExperiment <- function(.data, order_by, ..., n, prop, + by=NULL, with_ties=TRUE, na_rm=FALSE) { + row_number___ <- NULL + order_by_variables <- return_arguments_of(!!enexpr(order_by)) + + idx <- .data |> + colData() |> + as.data.frame() |> + select(-everything(), !!!order_by_variables, {{ by }}) |> + rowid_to_column(var ='row_number___') |> + slice_min( + order_by={{ order_by }}, ..., n=n, prop=prop, by={{ by }}, + with_ties=with_ties, na_rm=na_rm + ) |> + pull(row_number___) + + new_obj <- .data[, idx] + new_obj +} + +#' @name slice_max +#' @rdname slice +#' @inherit dplyr::slice_max +#' @examples +#' data(pbmc_small) +#' # Rows with minimum and maximum values of a metadata variable +#' pbmc_small |> slice_max(nFeature_RNA, n=5) +#' +#' @importFrom dplyr slice_max +#' @importFrom tibble rowid_to_column +#' @export +slice_max.SingleCellExperiment <- function(.data, order_by, ..., n, prop, + by=NULL, with_ties=TRUE, na_rm=FALSE) { + row_number___ <- NULL + + order_by_variables <- return_arguments_of(!!enexpr(order_by)) + + idx <- .data |> + colData() |> + as.data.frame() |> + select(-everything(), !!!order_by_variables, {{ by }}) |> + rowid_to_column(var ='row_number___') |> + slice_max( + order_by={{ order_by }}, ..., n=n, prop=prop, by={{ by }}, + with_ties=with_ties, na_rm=na_rm + ) |> + pull(row_number___) + + new_obj <- .data[, idx] + new_obj +} + + #' @name select #' @rdname select #' @inherit dplyr::select diff --git a/R/utilities.R b/R/utilities.R index 5dbde44..207fee4 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -306,6 +306,18 @@ quo_names <- function(v) { unlist() } +#' returns variables from an expression +#' @param expression an expression +#' @importFrom rlang enexpr +#' @return list of symbols +return_arguments_of <- function(expression){ + variables <- enexpr(expression) |> as.list() + if(length(variables) > 1) { + variables <- variables[-1] # removes first element which is function + } + variables +} + #' @importFrom purrr when #' @importFrom dplyr select #' @importFrom rlang expr diff --git a/man/bind_rows.Rd b/man/bind_rows.Rd index 39a9050..99f2ad8 100644 --- a/man/bind_rows.Rd +++ b/man/bind_rows.Rd @@ -5,7 +5,7 @@ \alias{bind_rows.SingleCellExperiment} \alias{bind_cols.SingleCellExperiment} \alias{bind_cols} -\title{Efficiently bind multiple data frames by row and column} +\title{#' Efficiently bind multiple data frames by row and column} \usage{ \method{bind_rows}{SingleCellExperiment}(..., .id = NULL, add.cell.ids = NULL) diff --git a/man/return_arguments_of.Rd b/man/return_arguments_of.Rd new file mode 100644 index 0000000..448e9b8 --- /dev/null +++ b/man/return_arguments_of.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{return_arguments_of} +\alias{return_arguments_of} +\title{returns variables from an expression} +\usage{ +return_arguments_of(expression) +} +\arguments{ +\item{expression}{an expression} +} +\value{ +list of symbols +} +\description{ +returns variables from an expression +} diff --git a/man/slice.Rd b/man/slice.Rd index 175f6b2..ac5a49d 100644 --- a/man/slice.Rd +++ b/man/slice.Rd @@ -8,9 +8,50 @@ \alias{slice_sample} \alias{slice_min} \alias{slice_max} +\alias{slice_sample.SingleCellExperiment} +\alias{slice_head.SingleCellExperiment} +\alias{slice_tail.SingleCellExperiment} +\alias{slice_min.SingleCellExperiment} +\alias{slice_max.SingleCellExperiment} \title{Subset rows using their positions} \usage{ \method{slice}{SingleCellExperiment}(.data, ..., .by = NULL, .preserve = FALSE) + +\method{slice_sample}{SingleCellExperiment}( + .data, + ..., + n = NULL, + prop = NULL, + by = NULL, + weight_by = NULL, + replace = FALSE +) + +\method{slice_head}{SingleCellExperiment}(.data, ..., n, prop, by = NULL) + +\method{slice_tail}{SingleCellExperiment}(.data, ..., n, prop, by = NULL) + +\method{slice_min}{SingleCellExperiment}( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) + +\method{slice_max}{SingleCellExperiment}( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -26,7 +67,7 @@ Indices beyond the number of rows in the input are silently ignored. For \verb{slice_*()}, these arguments are passed on to methods.} -\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +\item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[dplyr:group_by]{group_by()}}. For @@ -35,6 +76,37 @@ details and examples, see \link[dplyr:dplyr_by]{?dplyr_by}.} \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} + +\item{n, prop}{Provide either \code{n}, the number of rows, or \code{prop}, the +proportion of rows to select. If neither are supplied, \code{n = 1} will be +used. If \code{n} is greater than the number of rows in the group +(or \code{prop > 1}), the result will be silently truncated to the group size. +\code{prop} will be rounded towards zero to generate an integer number of +rows. + +A negative value of \code{n} or \code{prop} will be subtracted from the group +size. For example, \code{n = -2} with a group of 5 rows will select 5 - 2 = 3 +rows; \code{prop = -0.25} with 8 rows will select 8 * (1 - 0.25) = 6 rows.} + +\item{weight_by}{<\code{\link[rlang:args_data_masking]{data-masking}}> Sampling +weights. This must evaluate to a vector of non-negative numbers the same +length as the input. Weights are automatically standardised to sum to 1.} + +\item{replace}{Should sampling be performed with (\code{TRUE}) or without +(\code{FALSE}, the default) replacement.} + +\item{order_by}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variable or +function of variables to order by. To order by multiple variables, wrap +them in a data frame or tibble.} + +\item{with_ties}{Should ties be kept together? The default, \code{TRUE}, +may return more rows than you request. Use \code{FALSE} to ignore ties, +and return the first \code{n} rows.} + +\item{na_rm}{Should missing values in \code{order_by} be removed from the result? +If \code{FALSE}, \code{NA} values are sorted to the end (like in \code{\link[dplyr:arrange]{arrange()}}), so +they will only be included if there are insufficient non-missing values to +reach \code{n}/\code{prop}.} } \value{ An object of the same type as \code{.data}. The output has the following @@ -69,6 +141,91 @@ operation, use \code{\link[dplyr:filter]{filter()}} and \code{\link[dplyr:row_nu \section{Methods}{ +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. +\item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. +\item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. +\item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. +\item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. +\item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. +} + + + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. +\item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. +\item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. +\item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. +\item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. +\item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. +} + + + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. +\item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. +\item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. +\item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. +\item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. +\item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. +} + + + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. +\item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. +\item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. +\item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. +\item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. +\item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. +} + + + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{slice()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice")}. +\item \code{slice_head()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_head")}. +\item \code{slice_tail()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_tail")}. +\item \code{slice_min()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_min")}. +\item \code{slice_max()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_max")}. +\item \code{slice_sample()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("slice_sample")}. +} + + + + These function are \strong{generic}s, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. @@ -89,6 +246,40 @@ Methods available in currently loaded packages: data(pbmc_small) pbmc_small |> slice(1) +data(pbmc_small) +pbmc_small |> slice_sample(n=1) +pbmc_small |> slice_sample(prop=0.1) + +data(pbmc_small) +# First rows based on existing order +pbmc_small |> slice_head(n=5) + +data(pbmc_small) +# First rows based on existing order +pbmc_small |> slice_tail(n=5) + +data(pbmc_small) + +# Rows with minimum and maximum values of a metadata variable +pbmc_small |> slice_min(nFeature_RNA, n=5) + +# slice_min() and slice_max() may return more rows than requested +# in the presence of ties. +pbmc_small |> slice_min(nFeature_RNA, n=2) + +# Use with_ties=FALSE to return exactly n matches +pbmc_small |> slice_min(nFeature_RNA, n=2, with_ties=FALSE) + +# Or use additional variables to break the tie: +pbmc_small |> slice_min(tibble::tibble(nFeature_RNA, nCount_RNA), n=2) + +# Use by for group-wise operations +pbmc_small |> slice_min(nFeature_RNA, n=5, by=groups) + +data(pbmc_small) +# Rows with minimum and maximum values of a metadata variable +pbmc_small |> slice_max(nFeature_RNA, n=5) + } \seealso{ Other single table verbs: diff --git a/tests/testthat/test-dplyr_methods.R b/tests/testthat/test-dplyr_methods.R index 1a07ac9..9e01f96 100755 --- a/tests/testthat/test-dplyr_methods.R +++ b/tests/testthat/test-dplyr_methods.R @@ -179,6 +179,92 @@ test_that("slice()", { expect_identical(slice(df, -i), df[, -i]) }) +test_that("slice_sample()", { + pbmc_small |> + slice_sample(n=0) |> + ncol() |> + expect_equal(0) + pbmc_small |> + slice_sample(n=50) |> + ncol() |> + expect_equal(50) +}) + +test_that("slice_head()", { + pbmc_small |> + slice_head(n=0) |> + ncol() |> + expect_equal(0) + pbmc_small |> + slice_head(n=50) |> + ncol() |> + expect_equal(50) + expect_equal( + colnames(pbmc_small) |> head(n=50), + pbmc_small |> slice_head(n=50) |> colnames() + ) +}) + +test_that("slice_tail()", { + pbmc_small |> + slice_tail(n=0) |> + ncol() |> + expect_equal(0) + pbmc_small |> + slice_tail(n=50) |> + ncol() |> + expect_equal(50) + expect_equal( + colnames(pbmc_small) |> tail(n=50), + pbmc_small |> slice_tail(n=50) |> colnames() + ) +}) + +test_that("slice_min()", { + pbmc_small |> + slice_min(nFeature_RNA, n=0) |> + ncol() |> + expect_equal(0) + pbmc_small |> + slice_min(nFeature_RNA, n=5) |> + ncol() |> + expect_equal(5) + expect_equal( + pbmc_small |> as_tibble() |> + arrange(nFeature_RNA) |> + head(n=5) %>% pull(.cell), + pbmc_small |> slice_min(nFeature_RNA, n=5) |> colnames() + ) +}) + +test_that("slice_max()", { + pbmc_small |> + slice_max(nFeature_RNA, n=0) |> + ncol() |> + expect_equal(0) + pbmc_small |> + slice_max(nFeature_RNA, n = 5) |> + ncol() |> + expect_equal(5) + expect_equal( + pbmc_small |> as_tibble() |> + arrange(desc(nFeature_RNA)) |> + head(n=5) %>% pull(.cell), + pbmc_small |> slice_max(nFeature_RNA, n=5) |> colnames() + ) +}) + +test_that("slice_min() slice_max() tibble input for order_by", { + pbmc_small |> + slice_min(tibble::tibble(nFeature_RNA, nCount_RNA), n=5) |> + ncol() |> + expect_equal(5) + pbmc_small |> + slice_max(tibble::tibble(nFeature_RNA, nCount_RNA), n=5) |> + ncol() |> + expect_equal(5) +}) + test_that("select()", { fd <- select(df, .cell, number) expect_s4_class(fd, "SingleCellExperiment")