@@ -672,38 +672,47 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
672672
673673 /**
674674 * Return approximate number of distinct values for each key in this RDD.
675- * The accuracy of approximation can be controlled through the relative standard deviation
676- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
677- * more accurate counts but increase the memory footprint and vise versa. Uses the provided
678- * Partitioner to partition the output RDD.
675+ *
676+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
677+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
678+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
679+ *
680+ * @param relativeSD Relative standard deviation of the estimate. Smaller values give more
681+ * accurate counts but create larger counters. It must be greater than 0.000017.
682+ * @param partitioner partitioner of the resulting RDD.
679683 */
680- def countApproxDistinctByKey (relativeSD : Double , partitioner : Partitioner ): JavaRDD [(K , Long )] = {
681- rdd.countApproxDistinctByKey(relativeSD, partitioner)
684+ def countApproxDistinctByKey (relativeSD : Double , partitioner : Partitioner ): JavaPairRDD [K , Long ] =
685+ {
686+ fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner))
682687 }
683688
684689 /**
685- * Return approximate number of distinct values for each key this RDD.
686- * The accuracy of approximation can be controlled through the relative standard deviation
687- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
688- * more accurate counts but increase the memory footprint and vise versa. The default value of
689- * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
690- * level.
690+ * Return approximate number of distinct values for each key in this RDD.
691+ *
692+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
693+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
694+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
695+ *
696+ * @param relativeSD Relative standard deviation of the estimate. Smaller values give more
697+ * accurate counts but create larger counters. It must be greater than 0.000017.
698+ * @param numPartitions number of partitions of the resulting RDD.
691699 */
692- def countApproxDistinctByKey (relativeSD : Double = 0.05 ): JavaRDD [( K , Long ) ] = {
693- rdd.countApproxDistinctByKey(relativeSD)
700+ def countApproxDistinctByKey (relativeSD : Double , numPartitions : Int ): JavaPairRDD [ K , Long ] = {
701+ fromRDD( rdd.countApproxDistinctByKey(relativeSD, numPartitions) )
694702 }
695703
696-
697704 /**
698705 * Return approximate number of distinct values for each key in this RDD.
699- * The accuracy of approximation can be controlled through the relative standard deviation
700- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
701- * more accurate counts but increase the memory footprint and vise versa. HashPartitions the
702- * output RDD into numPartitions.
703706 *
707+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
708+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
709+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
710+ *
711+ * @param relativeSD Relative standard deviation of the estimate. Smaller values give more
712+ * accurate counts but create larger counters. It must be greater than 0.000017.
704713 */
705- def countApproxDistinctByKey (relativeSD : Double , numPartitions : Int ): JavaRDD [( K , Long ) ] = {
706- rdd.countApproxDistinctByKey(relativeSD, numPartitions )
714+ def countApproxDistinctByKey (relativeSD : Double ): JavaPairRDD [ K , Long ] = {
715+ fromRDD( rdd.countApproxDistinctByKey(relativeSD) )
707716 }
708717
709718 /** Assign a name to this RDD */
0 commit comments