Skip to content

Commit d6effee

Browse files
committed
SPARK-1165 Implemented RDD.intersection in python.
1 parent 0283665 commit d6effee

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

python/pyspark/rdd.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,19 @@ def union(self, other):
319319
return RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,
320320
self.ctx.serializer)
321321

322+
def intersection(self, other):
323+
"""
324+
Return the intersection of this RDD and another one.
325+
326+
>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
327+
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
328+
>>> rdd1.intersection(rdd2).collect()
329+
[1, 2, 3]
330+
"""
331+
return self.map(lambda v: (v, None)).cogroup(
332+
other.map(lambda v: (v, None))).filter(
333+
lambda x: (len(x[1][0]) != 0) and (len(x[1][1]) != 0)).keys()
334+
322335
def _reserialize(self):
323336
if self._jrdd_deserializer == self.ctx.serializer:
324337
return self

0 commit comments

Comments
 (0)