Skip to content

Commit cf0a8f0

Browse files
andrewor14 authored and pwendell committed
[SPARK-1681] Include datanucleus jars in Spark Hive distribution
This copies the datanucleus jars over from `lib_managed` into `dist/lib`, if any. The `CLASSPATH` must also be updated to reflect this change.

Author: Andrew Or <[email protected]>

Closes #610 from andrewor14/hive-distribution and squashes the following commits:

a4bc96f [Andrew Or] Rename search path in jar error check
fa205e1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into hive-distribution
7855f58 [Andrew Or] Have jar command respect JAVA_HOME + check for jar errors both cases
c16bbfd [Andrew Or] Merge branch 'master' of github.com:apache/spark into hive-distribution
32f6826 [Andrew Or] Leave the double colons
940a1bb [Andrew Or] Add back 2>/dev/null
58357cc [Andrew Or] Include datanucleus jars in Spark distribution built with Hive support
1 parent a975a19 commit cf0a8f0

File tree

2 files changed

+40
-29
lines changed

2 files changed

+40
-29
lines changed

bin/compute-classpath.sh

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
3232

3333
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
3434

35-
if [ -n "${JAVA_HOME}" ]; then
36-
JAR_CMD="${JAVA_HOME}/bin/jar"
35+
if [ -n "$JAVA_HOME" ]; then
36+
JAR_CMD="$JAVA_HOME/bin/jar"
3737
else
3838
JAR_CMD="jar"
3939
fi
@@ -52,40 +52,48 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
5252
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
5353
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
5454

55-
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
56-
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
55+
ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
5756
else
5857
# Else use spark-assembly jar from either RELEASE or assembly directory
5958
if [ -f "$FWDIR/RELEASE" ]; then
60-
ASSEMBLY_JAR=`ls "$FWDIR"/lib/spark-assembly*hadoop*.jar`
59+
ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
6160
else
62-
ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar`
61+
ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
6362
fi
64-
jar_error_check=$($JAR_CMD -tf $ASSEMBLY_JAR org/apache/spark/SparkContext 2>&1)
65-
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
66-
echo "Loading Spark jar with '$JAR_CMD' failed. "
67-
echo "This is likely because Spark was compiled with Java 7 and run "
68-
echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
69-
echo "or build Spark with Java 6."
70-
exit 1
71-
fi
72-
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
7363
fi
7464

65+
# Verify that versions of java used to build the jars and run Spark are compatible
66+
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
67+
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
68+
echo "Loading Spark jar with '$JAR_CMD' failed. "
69+
echo "This is likely because Spark was compiled with Java 7 and run "
70+
echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
71+
echo "or build Spark with Java 6."
72+
exit 1
73+
fi
74+
75+
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
76+
7577
# When Hive support is needed, Datanucleus jars must be included on the classpath.
76-
# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
78+
# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
7779
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
7880
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
7981
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
8082
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
81-
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
82-
if [ $num_datanucleus_jars -gt 0 ]; then
83-
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
84-
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
85-
if [ $num_hive_files -gt 0 ]; then
83+
if [ -f "$FWDIR/RELEASE" ]; then
84+
datanucleus_dir="$FWDIR"/lib
85+
else
86+
datanucleus_dir="$FWDIR"/lib_managed/jars
87+
fi
88+
89+
datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")
90+
datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
91+
92+
if [ -n "$datanucleus_jars" ]; then
93+
hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null)
94+
if [ -n "$hive_files" ]; then
8695
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
87-
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
88-
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
96+
CLASSPATH="$CLASSPATH:$datanucleus_jars"
8997
fi
9098
fi
9199

@@ -105,10 +113,10 @@ fi
105113
# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
106114
# Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
107115
# the configuration files.
108-
if [ "x" != "x$HADOOP_CONF_DIR" ]; then
116+
if [ -n "$HADOOP_CONF_DIR" ]; then
109117
CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR"
110118
fi
111-
if [ "x" != "x$YARN_CONF_DIR" ]; then
119+
if [ -n "$YARN_CONF_DIR" ]; then
112120
CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
113121
fi
114122

make-distribution.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,13 @@ if [ $? != 0 ]; then
5151
exit -1;
5252
fi
5353

54-
if [ -z "${JAVA_HOME}" ]; then
54+
if [ -z "$JAVA_HOME" ]; then
5555
echo "Error: JAVA_HOME is not set, cannot proceed."
5656
exit -1
5757
fi
5858

59-
JAVA_CMD=$JAVA_HOME/bin/java
60-
JAVA_VERSION=$($JAVA_CMD -version 2>&1)
59+
JAVA_CMD="$JAVA_HOME"/bin/java
60+
JAVA_VERSION=$("$JAVA_CMD" -version 2>&1)
6161
if ! [[ "$JAVA_VERSION" =~ "1.6" ]]; then
6262
echo "Error: JAVA_HOME must point to a JDK 6 installation (see SPARK-1703)."
6363
echo "Output from 'java -version' was:"
@@ -162,6 +162,10 @@ echo "Spark $VERSION built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE
162162
cp $FWDIR/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
163163
cp $FWDIR/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
164164

165+
if [ "$SPARK_HIVE" == "true" ]; then
166+
cp $FWDIR/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
167+
fi
168+
165169
# Copy other things
166170
mkdir "$DISTDIR"/conf
167171
cp "$FWDIR"/conf/*.template "$DISTDIR"/conf
@@ -170,7 +174,6 @@ cp -r "$FWDIR/bin" "$DISTDIR"
170174
cp -r "$FWDIR/python" "$DISTDIR"
171175
cp -r "$FWDIR/sbin" "$DISTDIR"
172176

173-
174177
# Download and copy in tachyon, if requested
175178
if [ "$SPARK_TACHYON" == "true" ]; then
176179
TACHYON_VERSION="0.4.1"

0 commit comments

Comments
 (0)