Skip to content

Commit 8db4d95

Browse files
gatorsmilecloud-fan
authored andcommitted
[SPARK-18703][SQL] Drop Staging Directories and Data Files After each Insertion/CTAS of Hive serde Tables
### What changes were proposed in this pull request? Below are the files/directories generated for three inserts againsts a Hive table: ``` /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1/-ext-10000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1/-ext-10000/._SUCCESS.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1/-ext-10000/.part-00000.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1/-ext-10000/_SUCCESS /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-29_149_4298858301766472202-1/-ext-10000/part-00000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1/-ext-10000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1/-ext-10000/._SUCCESS.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1/-ext-10000/.part-00000.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1/-ext-10000/_SUCCESS /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_454_6445008511655931341-1/-ext-10000/part-00000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1/-ext-10000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1/-ext-10000/._SUCCESS.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1/-ext-10000/.part-00000.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1/-ext-10000/_SUCCESS /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.hive-staging_hive_2016-12-03_20-56-30_722_3388423608658711001-1/-ext-10000/part-00000 /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.part-00000.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/part-00000 ``` The first 18 files are temporary. We do not drop it until the end of JVM termination. If JVM does not appropriately terminate, these temporary files/directories will not be dropped. Only the last two files are needed, as shown below. ``` /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/.part-00000.crc /private/var/folders/4b/sgmfldk15js406vk7lw5llzw0000gn/T/spark-41eaa5ce-0288-471e-bba1-09cc482813ff/part-00000 ``` The temporary files/directories could accumulate a lot when we issue many inserts, since each insert generats at least six files. This could eat a lot of spaces and slow down the JVM termination. When the JVM does not terminates approprately, the files might not be dropped. This PR is to drop the created staging files and temporary data files after each insert/CTAS. ### How was this patch tested? Added a test case Author: gatorsmile <[email protected]> Closes #16134 from gatorsmile/deleteFiles.
1 parent 3243885 commit 8db4d95

File tree

2 files changed

+38
-2
lines changed

2 files changed

+38
-2
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import java.net.URI
2222
import java.text.SimpleDateFormat
2323
import java.util.{Date, Locale, Random}
2424

25+
import scala.util.control.NonFatal
26+
2527
import org.apache.hadoop.fs.{FileSystem, Path}
2628
import org.apache.hadoop.hive.common.FileUtils
2729
import org.apache.hadoop.hive.ql.exec.TaskRunner
@@ -84,6 +86,7 @@ case class InsertIntoHiveTable(
8486
def output: Seq[Attribute] = Seq.empty
8587

8688
val hadoopConf = sessionState.newHadoopConf()
89+
var createdTempDir: Option[Path] = None
8790
val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging")
8891
val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive")
8992

@@ -111,12 +114,12 @@ case class InsertIntoHiveTable(
111114
if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) {
112115
throw new IllegalStateException("Cannot create staging directory '" + dir.toString + "'")
113116
}
117+
createdTempDir = Some(dir)
114118
fs.deleteOnExit(dir)
115119
} catch {
116120
case e: IOException =>
117121
throw new RuntimeException(
118122
"Cannot create staging directory '" + dir.toString + "': " + e.getMessage, e)
119-
120123
}
121124
return dir
122125
}
@@ -163,11 +166,11 @@ case class InsertIntoHiveTable(
163166
if (!FileUtils.mkdir(fs, dirPath, true, hadoopConf)) {
164167
throw new IllegalStateException("Cannot create staging directory: " + dirPath.toString)
165168
}
169+
createdTempDir = Some(dirPath)
166170
fs.deleteOnExit(dirPath)
167171
} catch {
168172
case e: IOException =>
169173
throw new RuntimeException("Cannot create staging directory: " + dirPath.toString, e)
170-
171174
}
172175
dirPath
173176
}
@@ -378,6 +381,15 @@ case class InsertIntoHiveTable(
378381
isSrcLocal = false)
379382
}
380383

384+
// Attempt to delete the staging directory and the inclusive files. If failed, the files are
385+
// expected to be dropped at the normal termination of VM since deleteOnExit is used.
386+
try {
387+
createdTempDir.foreach { path => path.getFileSystem(hadoopConf).delete(path, true) }
388+
} catch {
389+
case NonFatal(e) =>
390+
logWarning(s"Unable to delete staging directory: $stagingDir.\n" + e)
391+
}
392+
381393
// Invalidate the cache.
382394
sqlContext.sharedState.cacheManager.invalidateCache(table)
383395
sqlContext.sessionState.catalog.refreshTable(table.catalogTable.identifier)

sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,30 @@ class VersionsSuite extends SparkFunSuite with SQLTestUtils with TestHiveSinglet
546546
}
547547
}
548548

549+
test(s"$version: Delete the temporary staging directory and files after each insert") {
550+
withTempDir { tmpDir =>
551+
withTable("tab") {
552+
spark.sql(
553+
s"""
554+
|CREATE TABLE tab(c1 string)
555+
|location '${tmpDir.toURI.toString}'
556+
""".stripMargin)
557+
558+
(1 to 3).map { i =>
559+
spark.sql(s"INSERT OVERWRITE TABLE tab SELECT '$i'")
560+
}
561+
def listFiles(path: File): List[String] = {
562+
val dir = path.listFiles()
563+
val folders = dir.filter(_.isDirectory).toList
564+
val filePaths = dir.map(_.getName).toList
565+
folders.flatMap(listFiles) ++: filePaths
566+
}
567+
val expectedFiles = ".part-00000.crc" :: "part-00000" :: Nil
568+
assert(listFiles(tmpDir).sorted == expectedFiles)
569+
}
570+
}
571+
}
572+
549573
// TODO: add more tests.
550574
}
551575
}

0 commit comments

Comments
 (0)