Skip to content

Commit 094c402

Browse files
hqzizaniajeanlyn
authored and committed
[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.
Moving here from amplab-extras/SparkR-pkg#241. sum() has been implemented (amplab-extras/SparkR-pkg#242). Now Phase 1: mean, sd, and var have been implemented, but some things still need to be improved following the suggestions in https://issues.apache.org/jira/browse/SPARK-6841. Author: qhuang <[email protected]>. Closes apache#5446 from hqzizania/R and squashes the following commits: f283572 [qhuang] add test unit for describe(); 2e74d5a [qhuang] add describe() DataFrame API.
1 parent a7b754d commit 094c402

File tree

4 files changed

+53
-0
lines changed

4 files changed

+53
-0
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ exportMethods("cache",
1313
"collect",
1414
"columns",
1515
"count",
16+
"describe",
1617
"distinct",
1718
"dtypes",
1819
"except",

R/pkg/R/DataFrame.R

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
12761276
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
12771277
})
12781278

1279+
#' describe
1280+
#'
1281+
#' Computes statistics for numeric columns.
1282+
#' If no columns are given, this function computes statistics for all columns of the DataFrame.
1283+
#'
1284+
#' @param x A DataFrame whose columns are to be summarized.
1285+
#' @param col A string giving the name of a column to compute statistics for.
1286+
#' @param ... Additional column names to compute statistics for.
1287+
#' @return A DataFrame
1288+
#' @rdname describe
1289+
#' @export
1290+
#' @examples
1291+
#'\dontrun{
1292+
#' sc <- sparkR.init()
1293+
#' sqlCtx <- sparkRSQL.init(sc)
1294+
#' path <- "path/to/file.json"
1295+
#' df <- jsonFile(sqlCtx, path)
1296+
#' describe(df)
1297+
#' describe(df, "col1")
1298+
#' describe(df, "col1", "col2")
1299+
#' }
1300+
setMethod("describe",
          signature(x = "DataFrame", col = "character"),
          function(x, col, ...) {
            # Bundle the first column name with any further arguments given
            # through `...`, pass the bundle through listToSeq(), and invoke
            # "describe" on the object held in x@sdf.
            requested <- listToSeq(list(col, ...))
            dataFrame(callJMethod(x@sdf, "describe", requested))
          })
1307+
1308+
#' @rdname describe
1309+
setMethod("describe",
          signature(x = "DataFrame"),
          function(x) {
            # No column names were supplied, so request statistics for every
            # column reported by columns(x).
            everyColumn <- listToSeq(as.list(c(columns(x))))
            dataFrame(callJMethod(x@sdf, "describe", everyColumn))
          })

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
384384
#' @export
385385
setGeneric("columns",
           function(x) {
             # Hand off to S4 method dispatch.
             standardGeneric("columns")
           })
386386

387+
#' @rdname describe
388+
#' @export
389+
setGeneric("describe",
           function(x, col, ...) {
             # Hand off to S4 method dispatch.
             standardGeneric("describe")
           })
390+
387391
#' @rdname schema
388392
#' @export
389393
setGeneric("dtypes",
           function(x) {
             # Hand off to S4 method dispatch.
             standardGeneric("dtypes")
           })

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
705705
expect_true(count(parquetDF) == count(df)*2)
706706
})
707707

708+
test_that("describe() on a DataFrame", {
  df <- jsonFile(sqlCtx, jsonPath)

  # Collect each described DataFrame once and reuse the local result, instead
  # of calling collect() again for every expectation.
  ageStats <- collect(describe(df, "age"))
  expect_true(ageStats[1, "summary"] == "count")
  # `==` is kept (rather than expect_equal) so that the comparison coerces
  # types exactly as before.
  expect_true(ageStats[2, "age"] == 24.5)
  expect_true(ageStats[3, "age"] == 5.5)

  # With no column names, describe() is applied to every column of df.
  allStats <- collect(describe(df))
  expect_true(allStats[4, "name"] == "Andy")
  expect_true(allStats[5, "age"] == 30.0)
})
718+
708719
# Delete the files at parquetPath and jsonPath.
# NOTE(review): presumably these are fixtures created earlier in this test
# file -- confirm against the part of the file not shown here.
unlink(parquetPath)
709720
unlink(jsonPath)

0 commit comments

Comments
 (0)