
Commit 4893622

Author: Marcelo Vanzin (committed)
[SPARK-7485] [build] Remove pyspark files from assembly.
The sbt part of the build is hacky; it basically tricks sbt into generating the zip by using a generator, but returns an empty list for the generated files so that nothing is actually added to the assembly.
1 parent 6dad76e · commit 4893622
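
For readers unfamiliar with the trick the commit message describes, here is a minimal build.sbt-style sketch (the task wiring and paths are illustrative assumptions, not the actual Spark code, which lives in project/SparkBuild.scala below): a resource generator task runs during packaging, writes the zip as a side effect, and reports zero generated files, so nothing is added to the assembly jar.

    // Minimal sketch of the generator trick, assuming sbt 0.13+ task syntax;
    // names and paths are hypothetical, not the real SparkBuild.scala wiring.
    resourceGenerators in Compile += Def.task {
      val zipFile = target.value / "pyspark.zip"  // assumed destination
      // Side effect: build the zip from the python sources.
      IO.zip(Path.allSubpaths(baseDirectory.value / "python"), zipFile)
      // Report no generated resources, so the assembly picks up nothing.
      Seq.empty[File]
    }.taskValue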

File tree

5 files changed: +4 −109 lines changed


core/pom.xml

Lines changed: 0 additions & 47 deletions
@@ -381,35 +381,6 @@
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
     <plugins>
-      <!-- Unzip py4j so we can include its files in the jar -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-antrun-plugin</artifactId>
-        <executions>
-          <execution>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>run</goal>
-            </goals>
-          </execution>
-        </executions>
-        <configuration>
-          <target>
-            <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
-          </target>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-clean-plugin</artifactId>
-        <configuration>
-          <filesets>
-            <fileset>
-              <directory>${basedir}/../python/build</directory>
-            </fileset>
-          </filesets>
-          <verbose>true</verbose>
-        </configuration>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-dependency-plugin</artifactId>
@@ -438,24 +409,6 @@
         </executions>
       </plugin>
     </plugins>
-
-    <resources>
-      <resource>
-        <directory>src/main/resources</directory>
-      </resource>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/*.py</include>
-        </includes>
-      </resource>
-      <resource>
-        <directory>../python/build</directory>
-        <includes>
-          <include>py4j/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>

   <profiles>

mllib/pom.xml

Lines changed: 0 additions & 11 deletions
@@ -141,16 +141,5 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/mllib/*.py</include>
-          <include>pyspark/mllib/stat/*.py</include>
-          <include>pyspark/ml/*.py</include>
-          <include>pyspark/ml/param/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>

project/SparkBuild.scala

Lines changed: 4 additions & 35 deletions
@@ -168,7 +168,7 @@ object SparkBuild extends PomBuild {
   /* Enable Assembly for all assembly projects */
   assemblyProjects.foreach(enable(Assembly.settings))

-  /* Package pyspark artifacts in the main assembly. */
+  /* Package pyspark artifacts in a separate zip file for YARN. */
   enable(PySparkAssembly.settings)(assembly)

   /* Enable unidoc only for the root spark project */
@@ -373,7 +373,6 @@ object PySparkAssembly {
   import java.util.zip.{ZipOutputStream, ZipEntry}

   lazy val settings = Seq(
-    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
     // Use a resource generator to copy all .py files from python/pyspark into a managed directory
     // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
     // list since that will copy unneeded / unwanted files.
@@ -388,7 +387,8 @@
       if (!dst.isDirectory()) {
         require(dst.mkdirs())
       }
-      copy(src, dst)
+
+      Seq[File]()
     }
   )

@@ -416,42 +416,11 @@
           output.write(buf, 0, n)
         }
       }
+      output.closeEntry()
       in.close()
     }
   }

-  private def copy(src: File, dst: File): Seq[File] = {
-    src.listFiles().flatMap { f =>
-      val child = new File(dst, f.getName())
-      if (f.isDirectory()) {
-        child.mkdir()
-        copy(f, child)
-      } else if (f.getName().endsWith(".py")) {
-        var in: Option[FileInputStream] = None
-        var out: Option[FileOutputStream] = None
-        try {
-          in = Some(new FileInputStream(f))
-          out = Some(new FileOutputStream(child))
-
-          val bytes = new Array[Byte](1024)
-          var read = 0
-          while (read >= 0) {
-            read = in.get.read(bytes)
-            if (read > 0) {
-              out.get.write(bytes, 0, read)
-            }
-          }
-
-          Some(child)
-        } finally {
-          in.foreach(_.close())
-          out.foreach(_.close())
-        }
-      } else {
-        None
-      }
-    }
-  }
 }

 object Unidoc {
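
The `output.closeEntry()` added above fixes the zip-writing loop: with java.util.zip, each entry must be closed after its bytes are written, before the next `putNextEntry()` call. A self-contained sketch of that standard pattern (the helper name `addToZip` is hypothetical, not Spark's code):

    import java.io.{File, FileInputStream}
    import java.util.zip.{ZipEntry, ZipOutputStream}

    // Sketch of the java.util.zip pattern used by PySparkAssembly.
    // closeEntry() -- the line this commit adds -- finalizes the current
    // entry's metadata in the stream before the next entry is started.
    def addToZip(output: ZipOutputStream, entryName: String, file: File): Unit = {
      output.putNextEntry(new ZipEntry(entryName))
      val in = new FileInputStream(file)
      try {
        val buf = new Array[Byte](4096)
        var n = in.read(buf)
        while (n > 0) {
          output.write(buf, 0, n)  // copy the file's bytes into the entry
          n = in.read(buf)
        }
      } finally {
        in.close()
      }
      output.closeEntry()          // finalize the entry before the next one
    }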

sql/core/pom.xml

Lines changed: 0 additions & 8 deletions
@@ -103,13 +103,5 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../../python</directory>
-        <includes>
-          <include>pyspark/sql/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>

streaming/pom.xml

Lines changed: 0 additions & 8 deletions
@@ -105,13 +105,5 @@
       </configuration>
     </plugin>
   </plugins>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/streaming/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>
