Commit 6568d2c

added repartition function to python API.
1 parent a1cd185 commit 6568d2c

File tree

1 file changed: +13 −0 lines


python/pyspark/rdd.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -970,6 +970,19 @@ def keyBy(self, f):
         """
         return self.map(lambda x: (f(x), x))
 
+    def repartition(self, numPartitions):
+        """
+        Return a new RDD that has exactly numPartitions partitions.
+
+        Can increase or decrease the level of parallelism in this RDD. Internally, this uses
+        a shuffle to redistribute data.
+        >>> sc.parallelize([1, 2, 3, 4, 5]).repartition(10).count()
+        If you are decreasing the number of partitions in this RDD, consider using `coalesce`,
+        which can avoid performing a shuffle.
+        """
+        jrdd = self._jrdd.repartition(numPartitions)
+        return RDD(jrdd, self.ctx, self._jrdd_deserializer)
+
     # TODO: `lookup` is disabled because we can't make direct comparisons based
     # on the key; we need to compare the hash of the key to the hash of the
     # keys in the pairs. This could be an expensive operation, since those
```
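Note that, as committed, the doctest line has no expected-output line, so `doctest` would treat the following prose sentence as the expected output and the example would fail; `count()` on a five-element RDD returns 5 regardless of how many partitions it is spread across. Without a Spark cluster at hand, what `repartition` does can be sketched in plain Python (hypothetical helper name, and a simple round-robin deal as a stand-in for the actual shuffle):

```python
def repartition(partitions, num_partitions):
    # Stand-in for RDD.repartition: the "shuffle" collects every element,
    # then deals them out round-robin into exactly num_partitions new partitions.
    elements = [x for part in partitions for x in part]
    new_parts = [[] for _ in range(num_partitions)]
    for i, x in enumerate(elements):
        new_parts[i % num_partitions].append(x)
    return new_parts

# Mirrors the doctest: 5 elements repartitioned to 10 partitions still count 5.
parts = repartition([[1, 2, 3], [4, 5]], 10)
count = sum(len(p) for p in parts)
```

Here `len(parts)` is 10 and `count` is 5, with some partitions left empty — increasing partition count never changes the element count, which is why the doctest's real output would be `5`.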

0 commit comments
