@@ -234,6 +234,30 @@ setMethod("checkpoint",
             rdd
           })
 
+#' Gets the number of partitions of an RDD
+#'
+#' @param rdd A RDD.
+#' @return the number of partitions of rdd as an integer.
+#' @rdname numPartitions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, 1:10, 2L)
+#' numPartitions(rdd) # 2L
+#'}
+setGeneric("numPartitions", function(rdd) { standardGeneric("numPartitions") })
+
+#' @rdname numPartitions
+#' @aliases numPartitions,RDD-method
+setMethod("numPartitions",
+          signature(rdd = "RDD"),
+          function(rdd) {
+            jrdd <- getJRDD(rdd)
+            partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
+            .jcall(partitions, "I", "size")
+          })
+
 #' Collect elements of an RDD
 #'
 #' @description
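A quick usage sketch of the new accessor, mirroring the roxygen example above (assumes a running local SparkR context from sparkR.init()):

    sc <- sparkR.init()
    rdd <- parallelize(sc, 1:10, 2L)  # explicitly request two slices
    numPartitions(rdd)                # 2L, one per slice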
@@ -359,6 +383,58 @@ setMethod("length",
             count(x)
           })
 
+#' Return the count of each unique value in this RDD as a list of
+#' (value, count) pairs.
+#'
+#' Same as countByValue in Spark.
+#'
+#' @param rdd The RDD to count values in.
+#' @return list of (value, count) pairs, where count is the number of times
+#'         each unique value occurs in rdd.
+#' @rdname countByValue
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, c(1,2,3,2,1))
+#' countByValue(rdd) # (1,2L), (2,2L), (3,1L)
+#'}
+setGeneric("countByValue", function(rdd) { standardGeneric("countByValue") })
+
+#' @rdname countByValue
+#' @aliases countByValue,RDD-method
+setMethod("countByValue",
+          signature(rdd = "RDD"),
+          function(rdd) {
+            ones <- lapply(rdd, function(item) { list(item, 1L) })
+            collect(reduceByKey(ones, `+`, numPartitions(rdd)))
+          })
+
+#' Count the number of elements for each key, and return the result to the
+#' master as lists of (key, count) pairs.
+#'
+#' Same as countByKey in Spark.
+#'
+#' @param rdd The RDD whose keys are counted.
+#' @return list of (key, count) pairs, where count is the number of occurrences of each key in rdd.
+#' @rdname countByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
+#' countByKey(rdd) # ("a", 2L), ("b", 1L)
+#'}
+setGeneric("countByKey", function(rdd) { standardGeneric("countByKey") })
+
+#' @rdname countByKey
+#' @aliases countByKey,RDD-method
+setMethod("countByKey",
+          signature(rdd = "RDD"),
+          function(rdd) {
+            keys <- lapply(rdd, function(item) { item[[1]] })
+            countByValue(keys)
+          })
 
 #' Apply a function to all elements
 #'
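The two counting helpers compose: countByKey() projects out each element's key and delegates to countByValue(), which in turn rides on reduceByKey(). A combined usage sketch, taken from the roxygen examples above (assumes a local SparkR context):

    sc <- sparkR.init()

    values <- parallelize(sc, c(1, 2, 3, 2, 1))
    countByValue(values)  # list of (value, count) pairs: (1,2L), (2,2L), (3,1L)

    pairs <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
    countByKey(pairs)     # ("a", 2L), ("b", 1L)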
@@ -659,8 +735,8 @@ setMethod("take",
             resList <- list()
             index <- -1
             jrdd <- getJRDD(rdd)
-            partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
-            numPartitions <- .jcall(partitions, "I", "size")
+            numPartitions <- numPartitions(rdd)
+
             # TODO(shivaram): Collect more than one partition based on size
             # estimates similar to the scala version of `take`.
             while (TRUE) {
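The refactor in take() is behavior-preserving: the new helper wraps exactly the two rJava calls it replaces, asking the JVM-side RDD for its list of splits and then for that list's size:

    # what numPartitions(rdd) now does internally (see the first hunk)
    jrdd <- getJRDD(rdd)
    partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
    .jcall(partitions, "I", "size")   # same value as numPartitions(rdd)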
@@ -707,9 +783,7 @@ setMethod("distinct",
           signature(rdd = "RDD", numPartitions = "missingOrInteger"),
           function(rdd, numPartitions) {
             if (missing(numPartitions)) {
-              jrdd <- getJRDD(rdd)
-              partitions <- .jcall(jrdd, "Ljava/util/List;", "splits")
-              numPartitions <- .jcall(partitions, "I", "size")
+              numPartitions <- SparkR::numPartitions(rdd)
             }
             identical.mapped <- lapply(rdd, function(x) { list(x, NULL) })
             reduced <- reduceByKey(identical.mapped,
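The namespace qualification here is deliberate: inside distinct() the formal argument numPartitions shadows the generic of the same name, so a bare call would try to invoke the missing argument and fail; SparkR::numPartitions reaches past it. (take() above has no such parameter, so its unqualified call resolves fine.) A minimal standalone sketch of the shadowing, using hypothetical names len and g:

    len <- function(x) length(x)   # stand-in for the numPartitions generic
    g <- function(x, len) {
      if (missing(len)) {
        # len(x) would fail here: the missing formal `len` shadows the
        # top-level function, so qualify the call or use another name.
        len <- length(x)
      }
      len
    }
    g(1:5)  # 5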