Skip to content

Commit 4dca9b1

Browse files
committed
Merge pull request apache#106 from hlin09/hlin09
Add function countByValue() and countByKey().
2 parents e6fb999 + 1220d92 commit 4dca9b1

File tree

6 files changed

+185
-5
lines changed

6 files changed

+185
-5
lines changed

pkg/NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ exportMethods(
99
"collectPartition",
1010
"combineByKey",
1111
"count",
12+
"countByKey",
13+
"countByValue",
1214
"distinct",
1315
"Filter",
1416
"filter",
@@ -27,6 +29,7 @@ exportMethods(
2729
"mapValues",
2830
"maximum",
2931
"minimum",
32+
"numPartitions",
3033
"partitionBy",
3134
"reduce",
3235
"reduceByKey",

pkg/R/RDD.R

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,30 @@ setMethod("checkpoint",
234234
rdd
235235
})
236236

237+
#' Gets the number of partitions of an RDD
238+
#'
239+
#' @param rdd An RDD.
240+
#' @return the number of partitions of rdd as an integer.
241+
#' @rdname numPartitions
242+
#' @export
243+
#' @examples
244+
#'\dontrun{
245+
#' sc <- sparkR.init()
246+
#' rdd <- parallelize(sc, 1:10, 2L)
247+
#' numPartitions(rdd) # 2L
248+
#'}
249+
setGeneric("numPartitions", function(rdd) { standardGeneric("numPartitions") })
250+
251+
#' @rdname numPartitions
252+
#' @aliases numPartitions,RDD-method
253+
setMethod("numPartitions",
254+
signature(rdd = "RDD"),
255+
function(rdd) {
256+
jrdd <- getJRDD(rdd)
257+
partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
258+
.jcall(partitions, "I", "size")
259+
})
260+
237261
#' Collect elements of an RDD
238262
#'
239263
#' @description
@@ -359,6 +383,58 @@ setMethod("length",
359383
count(x)
360384
})
361385

386+
#' Return the count of each unique value in this RDD as a list of
387+
#' (value, count) pairs.
388+
#'
389+
#' Same as countByValue in Spark.
390+
#'
391+
#' @param rdd The RDD to count
392+
#' @return list of (value, count) pairs, where count is number of each unique
393+
#' value in rdd.
394+
#' @rdname countByValue
395+
#' @export
396+
#' @examples
397+
#'\dontrun{
398+
#' sc <- sparkR.init()
399+
#' rdd <- parallelize(sc, c(1,2,3,2,1))
400+
#' countByValue(rdd) # (1,2L), (2,2L), (3,1L)
401+
#'}
402+
setGeneric("countByValue", function(rdd) { standardGeneric("countByValue") })
403+
404+
#' @rdname countByValue
405+
#' @aliases countByValue,RDD-method
406+
setMethod("countByValue",
407+
signature(rdd = "RDD"),
408+
function(rdd) {
409+
ones <- lapply(rdd, function(item) { list(item, 1L) })
410+
collect(reduceByKey(ones, `+`, numPartitions(rdd)))
411+
})
412+
413+
#' Count the number of elements for each key, and return the result to the
414+
#' master as lists of (key, count) pairs.
415+
#'
416+
#' Same as countByKey in Spark.
417+
#'
418+
#' @param rdd The RDD to count keys.
419+
#' @return list of (key, count) pairs, where count is number of each key in rdd.
420+
#' @rdname countByKey
421+
#' @export
422+
#' @examples
423+
#'\dontrun{
424+
#' sc <- sparkR.init()
425+
#' rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
426+
#' countByKey(rdd) # ("a", 2L), ("b", 1L)
427+
#'}
428+
setGeneric("countByKey", function(rdd) { standardGeneric("countByKey") })
429+
430+
#' @rdname countByKey
431+
#' @aliases countByKey,RDD-method
432+
setMethod("countByKey",
433+
signature(rdd = "RDD"),
434+
function(rdd) {
435+
keys <- lapply(rdd, function(item) { item[[1]] })
436+
countByValue(keys)
437+
})
362438

363439
#' Apply a function to all elements
364440
#'
@@ -659,8 +735,8 @@ setMethod("take",
659735
resList <- list()
660736
index <- -1
661737
jrdd <- getJRDD(rdd)
662-
partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
663-
numPartitions <- .jcall(partitions, "I", "size")
738+
numPartitions <- numPartitions(rdd)
739+
664740
# TODO(shivaram): Collect more than one partition based on size
665741
# estimates similar to the scala version of `take`.
666742
while (TRUE) {
@@ -707,9 +783,7 @@ setMethod("distinct",
707783
signature(rdd = "RDD", numPartitions = "missingOrInteger"),
708784
function(rdd, numPartitions) {
709785
if (missing(numPartitions)) {
710-
jrdd <- getJRDD(rdd)
711-
partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
712-
numPartitions <- .jcall(partitions, "I", "size")
786+
numPartitions <- SparkR::numPartitions(rdd)
713787
}
714788
identical.mapped <- lapply(rdd, function(x) { list(x, NULL) })
715789
reduced <- reduceByKey(identical.mapped,

pkg/inst/tests/test_rdd.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,27 @@ rdd <- parallelize(sc, nums, 2L)
1010
intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
1111
intRdd <- parallelize(sc, intPairs, 2L)
1212

13+
test_that("get number of partitions in RDD", {
14+
expect_equal(numPartitions(rdd), 2)
15+
expect_equal(numPartitions(intRdd), 2)
16+
})
17+
1318
test_that("count and length on RDD", {
1419
expect_equal(count(rdd), 10)
1520
expect_equal(length(rdd), 10)
1621
})
1722

23+
test_that("count by values and keys", {
24+
mods <- lapply(rdd, function(x) { x %% 3 })
25+
actual <- countByValue(mods)
26+
expected <- list(list(0, 3L), list(1, 4L), list(2, 3L))
27+
expect_equal(actual, expected)
28+
29+
actual <- countByKey(intRdd)
30+
expected <- list(list(2L, 2L), list(1L, 2L))
31+
expect_equal(actual, expected)
32+
})
33+
1834
test_that("lapply on RDD", {
1935
multiples <- lapply(rdd, function(x) { 2 * x })
2036
actual <- collect(multiples)

pkg/man/countByKey.Rd

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
% Generated by roxygen2 (4.0.2): do not edit by hand
2+
\docType{methods}
3+
\name{countByKey}
4+
\alias{countByKey}
5+
\alias{countByKey,RDD-method}
6+
\title{Count the number of elements for each key, and return the result to the
7+
master as lists of (key, count) pairs.}
8+
\usage{
9+
countByKey(rdd)
10+
11+
\S4method{countByKey}{RDD}(rdd)
12+
}
13+
\arguments{
14+
\item{rdd}{The RDD to count keys.}
15+
}
16+
\value{
17+
list of (key, count) pairs, where count is number of each key in rdd.
18+
}
19+
\description{
20+
Same as countByKey in Spark.
21+
}
22+
\examples{
23+
\dontrun{
24+
sc <- sparkR.init()
25+
rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
26+
countByKey(rdd) # ("a", 2L), ("b", 1L)
27+
}
28+
}
29+

pkg/man/countByValue.Rd

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
% Generated by roxygen2 (4.0.2): do not edit by hand
2+
\docType{methods}
3+
\name{countByValue}
4+
\alias{countByValue}
5+
\alias{countByValue,RDD-method}
6+
\title{Return the count of each unique value in this RDD as a list of
7+
(value, count) pairs.}
8+
\usage{
9+
countByValue(rdd)
10+
11+
\S4method{countByValue}{RDD}(rdd)
12+
}
13+
\arguments{
14+
\item{rdd}{The RDD to count}
15+
}
16+
\value{
17+
list of (value, count) pairs, where count is number of each unique
18+
value in rdd.
19+
}
20+
\description{
21+
Same as countByValue in Spark.
22+
}
23+
\examples{
24+
\dontrun{
25+
sc <- sparkR.init()
26+
rdd <- parallelize(sc, c(1,2,3,2,1))
27+
countByValue(rdd) # (1,2L), (2,2L), (3,1L)
28+
}
29+
}
30+

pkg/man/numPartitions.Rd

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
% Generated by roxygen2 (4.0.2): do not edit by hand
2+
\docType{methods}
3+
\name{numPartitions}
4+
\alias{numPartitions}
5+
\alias{numPartitions,RDD-method}
6+
\title{Gets the number of partitions of an RDD}
7+
\usage{
8+
numPartitions(rdd)
9+
10+
\S4method{numPartitions}{RDD}(rdd)
11+
}
12+
\arguments{
13+
\item{rdd}{An RDD.}
14+
}
15+
\value{
16+
the number of partitions of rdd as an integer.
17+
}
18+
\description{
19+
Gets the number of partitions of an RDD
20+
}
21+
\examples{
22+
\dontrun{
23+
sc <- sparkR.init()
24+
rdd <- parallelize(sc, 1:10, 2L)
25+
numPartitions(rdd) # 2L
26+
}
27+
}
28+

0 commit comments

Comments
 (0)