Skip to content

Commit 4670678

Browse files
author
Srinivasa Reddy Vundela
committed
[SPARK-12717][PYSPARK] Resolving race condition with pyspark broadcasts when using multiple threads
1 parent 71a8e9d commit 4670678

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

python/pyspark/rdd.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import socket
2929
from subprocess import Popen, PIPE
3030
from tempfile import NamedTemporaryFile
31-
from threading import Thread
31+
from threading import Thread, Lock
3232
from collections import defaultdict
3333
from itertools import chain
3434
from functools import reduce
@@ -55,6 +55,9 @@
5555

5656
__all__ = ["RDD"]
5757

58+
# Lock which will make sure that dependent broadcast variables are pickled along
59+
# with their PythonRDD wrapped function when using multiple threads (SPARK-12717).
60+
_lock = Lock()
5861

5962
def portable_hash(x):
6063
"""
@@ -2451,10 +2454,12 @@ def _jrdd(self):
24512454
else:
24522455
profiler = None
24532456

2454-
wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
2455-
self._jrdd_deserializer, profiler)
2456-
python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
2457-
self.preservesPartitioning)
2457+
with _lock:
2458+
wrapped_func = _wrap_function(self.ctx, self.func,
2459+
self._prev_jrdd_deserializer, self._jrdd_deserializer, profiler)
2460+
python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
2461+
self.preservesPartitioning)
2462+
24582463
self._jrdd_val = python_rdd.asJavaRDD()
24592464

24602465
if profiler:

0 commit comments

Comments
 (0)