Skip to content

Commit d3c426a

Browse files
kiszk authored and Marcelo Vanzin committed
[SPARK-10878][CORE] Fix race condition when multiple clients resolves artifacts at the same time
## What changes were proposed in this pull request? When multiple clients attempt to resolve artifacts via the `--packages` parameter, they could run into a race condition when they each attempt to modify the dummy `org.apache.spark-spark-submit-parent-default.xml` file created in the default ivy cache dir. This PR changes the behavior to encode a UUID in the dummy module descriptor so each client will operate on a different resolution file in the ivy cache dir. In addition, this patch changes the behavior of when and which resolution files are cleaned, to prevent accumulation of resolution files in the default ivy cache dir. Since this PR is a successor of apache#18801, close apache#18801. Much of the code was ported from apache#18801. **Many efforts were put here. I think this PR should be credited to Victsm.** ## How was this patch tested? added UT into `SparkSubmitUtilsSuite` Author: Kazuaki Ishizaki <[email protected]> Closes apache#21251 from kiszk/SPARK-10878.
1 parent 3e26005 commit d3c426a

File tree

2 files changed

+47
-10
lines changed

2 files changed

+47
-10
lines changed

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import java.lang.reflect.{InvocationTargetException, Modifier, UndeclaredThrowab
2222
import java.net.URL
2323
import java.security.PrivilegedExceptionAction
2424
import java.text.ParseException
25+
import java.util.UUID
2526

2627
import scala.annotation.tailrec
2728
import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
@@ -1204,7 +1205,33 @@ private[spark] object SparkSubmitUtils {
12041205

12051206
/** A nice function to use in tests as well. Values are dummy strings. */
def getModuleDescriptor: DefaultModuleDescriptor = {
  // Embed a fresh random UUID in the dummy module name so that multiple clients
  // resolving maven coordinates at the same time never operate on the same
  // resolution file in the ivy cache dir.
  val dummyRevisionId = ModuleRevisionId.newInstance(
    "org.apache.spark",
    s"spark-submit-parent-${UUID.randomUUID().toString}",
    "1.0")
  DefaultModuleDescriptor.newDefaultInstance(dummyRevisionId)
}
1213+
1214+
/**
 * Clear ivy resolution from current launch. The resolution file is usually at
 * ~/.ivy2/org.apache.spark-spark-submit-parent-$UUID-default.xml,
 * ~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.xml, and
 * ~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.properties.
 * Since each launch will have its own resolution files created, delete them after
 * each resolution to prevent accumulation of these files in the ivy cache dir.
 */
private def clearIvyResolutionFiles(
    mdId: ModuleRevisionId,
    ivySettings: IvySettings,
    ivyConfName: String): Unit = {
  val organisation = mdId.getOrganisation
  val moduleName = mdId.getName
  val revision = mdId.getRevision
  // The three files Ivy leaves behind for this (UUID-unique) module descriptor.
  Seq(
    s"$organisation-$moduleName-$ivyConfName.xml",
    s"resolved-$organisation-$moduleName-$revision.xml",
    s"resolved-$organisation-$moduleName-$revision.properties"
  ).foreach { resolutionFile =>
    new File(ivySettings.getDefaultCache, resolutionFile).delete()
  }
}
12081235

12091236
/**
12101237
* Resolves any dependencies that were supplied through maven coordinates
@@ -1255,14 +1282,6 @@ private[spark] object SparkSubmitUtils {
12551282

12561283
// A Module descriptor must be specified. Entries are dummy strings
12571284
val md = getModuleDescriptor
1258-
// clear ivy resolution from previous launches. The resolution file is usually at
1259-
// ~/.ivy2/org.apache.spark-spark-submit-parent-default.xml. In between runs, this file
1260-
// leads to confusion with Ivy when the files can no longer be found at the repository
1261-
// declared in that file/
1262-
val mdId = md.getModuleRevisionId
1263-
val previousResolution = new File(ivySettings.getDefaultCache,
1264-
s"${mdId.getOrganisation}-${mdId.getName}-$ivyConfName.xml")
1265-
if (previousResolution.exists) previousResolution.delete
12661285

12671286
md.setDefaultConf(ivyConfName)
12681287

@@ -1283,7 +1302,10 @@ private[spark] object SparkSubmitUtils {
12831302
packagesDirectory.getAbsolutePath + File.separator +
12841303
"[organization]_[artifact]-[revision](-[classifier]).[ext]",
12851304
retrieveOptions.setConfs(Array(ivyConfName)))
1286-
resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
1305+
val paths = resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
1306+
val mdId = md.getModuleRevisionId
1307+
clearIvyResolutionFiles(mdId, ivySettings, ivyConfName)
1308+
paths
12871309
} finally {
12881310
System.setOut(sysOut)
12891311
}

core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,4 +256,19 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
256256
assert(jarPath.indexOf("mydep") >= 0, "should find dependency")
257257
}
258258
}
259+
260+
test("SPARK-10878: test resolution files cleaned after resolving artifact") {
  val main = new MavenCoordinate("my.great.lib", "mylib", "0.1")

  IvyTestUtils.withRepository(main, None, None) { repo =>
    val ivySettings = SparkSubmitUtils.buildIvySettings(Some(repo), Some(tempIvyPath))
    SparkSubmitUtils.resolveMavenCoordinates(
      main.toString,
      ivySettings,
      isTest = true)
    // After resolution, no spark-submit-parent-* resolution files may remain
    // in the default ivy cache dir.
    val resolutionFilePattern = """.*org.apache.spark-spark-submit-parent-.*""".r
    val leftoverNames = ivySettings.getDefaultCache.listFiles.map(_.getName)
    assert(!leftoverNames.exists(resolutionFilePattern.findFirstIn(_).isDefined),
      "resolution files should be cleaned")
  }
}
259274
}

0 commit comments

Comments
 (0)