Skip to content

Commit 1625baa

Browse files
author
Marcelo Vanzin
committed
[SPARK-26025][k8s] Speed up docker image build on dev repo.
The "build context" for a docker image - basically the whole contents of the current directory where "docker" is invoked - can be huge in a dev build, easily exceeding a couple of gigabytes. Copying that data three times during the build of the docker images severely slows down the process. This patch creates a smaller build context - basically mimicking what the make-distribution.sh script does - so that when building the docker images, only the necessary bits are in the current directory. For PySpark and R this is optimized further, since those images are built on top of the previously built main Spark image. In my current local clone, the directory size is about 2G, but with this script the "context" sent to docker is about 250M for the main image, 1M for the pyspark image, and 8M for the R image. That speeds up the image builds considerably.
1 parent f9ff756 commit 1625baa

File tree

2 files changed

+92
-48
lines changed

2 files changed

+92
-48
lines changed

bin/docker-image-tool.sh

Lines changed: 88 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,20 @@ if [ -z "${SPARK_HOME}" ]; then
2929
fi
3030
. "${SPARK_HOME}/bin/load-spark-env.sh"
3131

32+
# Scratch location for the minimized docker build contexts (dev builds only).
CTX_DIR="$SPARK_HOME/target/tmp/docker"

# Succeeds (exit 0) when running from a source checkout rather than a release
# distribution; release tarballs ship a RELEASE marker file at the top level.
function is_dev_build {
  if [ -f "$SPARK_HOME/RELEASE" ]; then
    return 1
  fi
  return 0
}
37+
38+
# Remove the temporary docker build context, but only for dev builds; release
# distributions never create $CTX_DIR, so there is nothing to clean up there.
function cleanup_ctx_dir {
  if is_dev_build; then
    # '--' stops option parsing so a context path starting with '-' cannot be
    # misread as an rm flag.
    rm -rf -- "$CTX_DIR"
  fi
}

# Ensure the temporary context is removed on any exit path (success or error).
trap cleanup_ctx_dir EXIT
45+
3246
function image_ref {
3347
local image="$1"
3448
local add_repo="${2:-1}"
@@ -41,76 +55,111 @@ function image_ref {
4155
echo "$image"
4256
}
4357

58+
# Create a smaller build context for docker in dev builds to make the build faster. Docker
# uploads all of the current directory to the daemon, and it can get pretty big with dev
# builds that contain test log files and other artifacts.
#
# Three build contexts are created, one for each image: base, pyspark, and sparkr. For them
# to have the desired effect, the docker command needs to be executed inside the appropriate
# context directory.
#
# Note: docker does not support symlinks in the build context.
#
# Runs in a subshell so 'set -e' and any state changes don't leak to the caller;
# paths are relative, so the caller must invoke this from $SPARK_HOME.
function create_dev_build_context {(
  set -e
  local BASE_CTX="$CTX_DIR/base"
  mkdir -p "$BASE_CTX/kubernetes"
  cp -r "resource-managers/kubernetes/docker/src/main/dockerfiles" \
    "$BASE_CTX/kubernetes/dockerfiles"

  cp -r "assembly/target/scala-$SPARK_SCALA_VERSION/jars" "$BASE_CTX/jars"
  cp -r "resource-managers/kubernetes/integration-tests/tests" \
    "$BASE_CTX/kubernetes/tests"

  mkdir "$BASE_CTX/examples"
  cp -r "examples/src" "$BASE_CTX/examples/src"
  # Copy just needed examples jars instead of everything: skip jars that are
  # already present in the main jars directory to avoid duplicating them in
  # the image. Quoting "$i" protects against paths containing whitespace.
  mkdir "$BASE_CTX/examples/jars"
  for i in "examples/target/scala-$SPARK_SCALA_VERSION/jars/"*; do
    if [ ! -f "$BASE_CTX/jars/$(basename "$i")" ]; then
      cp "$i" "$BASE_CTX/examples/jars"
    fi
  done

  for other in bin sbin data; do
    cp -r "$other" "$BASE_CTX/$other"
  done

  # The pyspark image builds on top of the base image, so its context only
  # needs the dockerfiles and the python libraries.
  local PYSPARK_CTX="$CTX_DIR/pyspark"
  mkdir -p "$PYSPARK_CTX/kubernetes"
  cp -r "resource-managers/kubernetes/docker/src/main/dockerfiles" \
    "$PYSPARK_CTX/kubernetes/dockerfiles"
  mkdir "$PYSPARK_CTX/python"
  cp -r "python/lib" "$PYSPARK_CTX/python/lib"

  # Likewise, the R image context only needs the dockerfiles and the R sources.
  local R_CTX="$CTX_DIR/sparkr"
  mkdir -p "$R_CTX/kubernetes"
  cp -r "resource-managers/kubernetes/docker/src/main/dockerfiles" \
    "$R_CTX/kubernetes/dockerfiles"
  cp -r "R" "$R_CTX/R"
)}
105+
106+
# Print the docker build context directory for image "$1": a per-image
# minimized context for dev builds, or $SPARK_HOME itself for a release
# distribution (whose layout already matches what the Dockerfiles expect).
function img_ctx_dir {
  local ctx="$SPARK_HOME"
  if is_dev_build; then
    ctx="$CTX_DIR/$1"
  fi
  echo "$ctx"
}
113+
44114
function build {
45115
local BUILD_ARGS
46-
local IMG_PATH
47-
local JARS
48-
49-
if [ ! -f "$SPARK_HOME/RELEASE" ]; then
50-
# Set image build arguments accordingly if this is a source repo and not a distribution archive.
51-
#
52-
# Note that this will copy all of the example jars directory into the image, and that will
53-
# contain a lot of duplicated jars with the main Spark directory. In a proper distribution,
54-
# the examples directory is cleaned up before generating the distribution tarball, so this
55-
# issue does not occur.
56-
IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles
57-
JARS=assembly/target/scala-$SPARK_SCALA_VERSION/jars
58-
BUILD_ARGS=(
59-
${BUILD_PARAMS}
60-
--build-arg
61-
img_path=$IMG_PATH
62-
--build-arg
63-
spark_jars=$JARS
64-
--build-arg
65-
example_jars=examples/target/scala-$SPARK_SCALA_VERSION/jars
66-
--build-arg
67-
k8s_tests=resource-managers/kubernetes/integration-tests/tests
68-
)
69-
else
70-
# Not passed as arguments to docker, but used to validate the Spark directory.
71-
IMG_PATH="kubernetes/dockerfiles"
72-
JARS=jars
73-
BUILD_ARGS=(${BUILD_PARAMS})
116+
local SPARK_ROOT="$SPARK_HOME"
117+
118+
if is_dev_build; then
119+
create_dev_build_context
120+
SPARK_ROOT="$CTX_DIR/base"
74121
fi
75122

76123
# Verify that the Docker image content directory is present
77-
if [ ! -d "$IMG_PATH" ]; then
124+
if [ ! -d "$SPARK_ROOT/kubernetes/dockerfiles" ]; then
78125
error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
79126
fi
80127

81128
# Verify that Spark has actually been built/is a runnable distribution
82129
# i.e. the Spark JARs that the Docker files will place into the image are present
83-
local TOTAL_JARS=$(ls $JARS/spark-* | wc -l)
130+
local TOTAL_JARS=$(ls $SPARK_ROOT/jars/spark-* | wc -l)
84131
TOTAL_JARS=$(( $TOTAL_JARS ))
85132
if [ "${TOTAL_JARS}" -eq 0 ]; then
86133
error "Cannot find Spark JARs. This script assumes that Apache Spark has first been built locally or this is a runnable distribution."
87134
fi
88135

136+
local BUILD_ARGS=(${BUILD_PARAMS})
89137
local BINDING_BUILD_ARGS=(
90138
${BUILD_PARAMS}
91139
--build-arg
92140
base_img=$(image_ref spark)
93141
)
94-
local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
95-
local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}
96-
local RDOCKERFILE=${RDOCKERFILE:-"$IMG_PATH/spark/bindings/R/Dockerfile"}
142+
local BASEDOCKERFILE=${BASEDOCKERFILE:-"kubernetes/dockerfiles/spark/Dockerfile"}
143+
local PYDOCKERFILE=${PYDOCKERFILE:-"kubernetes/dockerfiles/spark/bindings/python/Dockerfile"}
144+
local RDOCKERFILE=${RDOCKERFILE:-"kubernetes/dockerfiles/spark/bindings/R/Dockerfile"}
97145

98-
docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
146+
(cd $(img_ctx_dir base) && docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
99147
-t $(image_ref spark) \
100-
-f "$BASEDOCKERFILE" .
148+
-f "$BASEDOCKERFILE" .)
101149
if [ $? -ne 0 ]; then
102150
error "Failed to build Spark JVM Docker image, please refer to Docker build output for details."
103151
fi
104152

105-
docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
153+
(cd $(img_ctx_dir pyspark) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
106154
-t $(image_ref spark-py) \
107-
-f "$PYDOCKERFILE" .
155+
-f "$PYDOCKERFILE" .)
108156
if [ $? -ne 0 ]; then
109157
error "Failed to build PySpark Docker image, please refer to Docker build output for details."
110158
fi
111-
docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
159+
160+
(cd $(img_ctx_dir sparkr) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
112161
-t $(image_ref spark-r) \
113-
-f "$RDOCKERFILE" .
162+
-f "$RDOCKERFILE" .)
114163
if [ $? -ne 0 ]; then
115164
error "Failed to build SparkR Docker image, please refer to Docker build output for details."
116165
fi

resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,6 @@
1717

1818
FROM openjdk:8-alpine
1919

20-
ARG spark_jars=jars
21-
ARG example_jars=examples/jars
22-
ARG img_path=kubernetes/dockerfiles
23-
ARG k8s_tests=kubernetes/tests
24-
2520
# Before building the docker image, first build and make a Spark distribution following
2621
# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
2722
# If this docker file is being used in the context of building your images from a Spark
@@ -41,13 +36,13 @@ RUN set -ex && \
4136
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
4237
chgrp root /etc/passwd && chmod ug+rw /etc/passwd
4338

44-
COPY ${spark_jars} /opt/spark/jars
39+
COPY jars /opt/spark/jars
4540
COPY bin /opt/spark/bin
4641
COPY sbin /opt/spark/sbin
47-
COPY ${img_path}/spark/entrypoint.sh /opt/
48-
COPY ${example_jars} /opt/spark/examples/jars
42+
COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/
43+
COPY examples/jars /opt/spark/examples/jars
4944
COPY examples/src /opt/spark/examples/src
50-
COPY ${k8s_tests} /opt/spark/tests
45+
COPY kubernetes/tests /opt/spark/tests
5146
COPY data /opt/spark/data
5247

5348
ENV SPARK_HOME /opt/spark

0 commit comments

Comments
 (0)