
Commit 2369e3a

[SPARK-24610] fix reading small files via wholeTextFiles

1 parent bc11146 · commit 2369e3a
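
For context beyond the one-line commit message: sc.wholeTextFiles reads each file as a single record through WholeTextFileInputFormat, which caps the split size at roughly totalLen / minPartitions. When the input files are small, that cap can fall below a cluster-configured mapreduce.input.fileinputformat.split.minsize.per.node (or .per.rack), and Hadoop's CombineFileInputFormat rejects the combination with an IOException. A minimal reproduction sketch, assuming a local Spark context (the input directory and app name are illustrative, not from the commit):

import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFilesRepro {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("repro"))

    // A per-node/per-rack minimum far larger than the small input files.
    sc.hadoopConfiguration.setLong(
      "mapreduce.input.fileinputformat.split.minsize.per.node", 123456)
    sc.hadoopConfiguration.setLong(
      "mapreduce.input.fileinputformat.split.minsize.per.rack", 123456)

    // Before this commit: throws java.io.IOException from CombineFileInputFormat
    // because maxSplitSize < minSplitSizePerNode. After it: succeeds.
    val n = sc.wholeTextFiles("/tmp/small-files").count()  // hypothetical directory
    println(s"read $n files")
    sc.stop()
  }
}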

File tree

core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala
core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala

2 files changed: +105 −0 lines changed

core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala
Lines changed: 13 additions & 0 deletions

@@ -53,6 +53,19 @@ private[spark] class WholeTextFileInputFormat
     val totalLen = files.map(file => if (file.isDirectory) 0L else file.getLen).sum
     val maxSplitSize = Math.ceil(totalLen * 1.0 /
       (if (minPartitions == 0) 1 else minPartitions)).toLong
+
+    // For small files we need to ensure the min split size per node & rack <= maxSplitSize
+    val config = context.getConfiguration
+    val minSplitSizePerNode = config.getLong(CombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 0L)
+    val minSplitSizePerRack = config.getLong(CombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 0L)
+
+    if (maxSplitSize < minSplitSizePerNode) {
+      super.setMinSplitSizeNode(maxSplitSize)
+    }
+
+    if (maxSplitSize < minSplitSizePerRack) {
+      super.setMinSplitSizeRack(maxSplitSize)
+    }
     super.setMaxSplitSize(maxSplitSize)
   }
 }
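
A quick numeric walk-through of the clamp above, using illustrative sizes (the 10/100/1000-byte files mirror the test suite below): three files totalling 1110 bytes with minPartitions = 2 give maxSplitSize = ceil(1110 / 2) = 555 bytes; a configured per-node minimum of 123456 bytes then exceeds it, which CombineFileInputFormat would reject, so the patch lowers the per-node (and per-rack) minimum to 555. A standalone sketch of the same arithmetic:

// Illustrative sketch only; mirrors the arithmetic in setMinPartitions above.
object SplitSizeClampSketch {
  def main(args: Array[String]): Unit = {
    val fileLens = Seq(10L, 100L, 1000L)  // three small files, 1110 bytes total
    val minPartitions = 2
    val maxSplitSize =
      Math.ceil(fileLens.sum * 1.0 / minPartitions).toLong  // 555

    val minSplitSizePerNode = 123456L  // hypothetical cluster-wide default

    // Without the fix CombineFileInputFormat fails because 555 < 123456;
    // the fix clamps the per-node minimum down to maxSplitSize.
    val effectiveMinPerNode =
      if (maxSplitSize < minSplitSizePerNode) maxSplitSize else minSplitSizePerNode
    println(s"maxSplitSize=$maxSplitSize, effectiveMinPerNode=$effectiveMinPerNode")
  }
}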
core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala
Lines changed: 92 additions & 0 deletions (new file)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.input

import java.io.{DataOutputStream, File, FileOutputStream}

import scala.collection.immutable.IndexedSeq

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

/**
 * Tests the correctness of
 * [[org.apache.spark.input.WholeTextFileInputFormat WholeTextFileInputFormat]]. A temporary
 * directory containing small files is created as fake input and deleted in the end.
 */
class WholeTextFileInputFormatSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
  private var sc: SparkContext = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val conf = new SparkConf()
    sc = new SparkContext("local", "test", conf)

    // Set the per-node and per-rack minimum split sizes far above the size of
    // the test files, so WholeTextFileInputFormat must clamp them to maxSplitSize.
    sc.hadoopConfiguration.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 123456)
    sc.hadoopConfiguration.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 123456)
  }

  override def afterAll(): Unit = {
    try {
      sc.stop()
    } finally {
      super.afterAll()
    }
  }

  // The compress flag is accepted for symmetry with similar helpers but unused here.
  private def createNativeFile(
      inputDir: File,
      fileName: String,
      contents: Array[Byte],
      compress: Boolean): Unit = {
    val path = s"${inputDir.toString}/$fileName"
    val out = new DataOutputStream(new FileOutputStream(path))
    out.write(contents, 0, contents.length)
    out.close()
  }

  test("for small files minimum split size per node and per rack should be less than or equal to " +
    "maximum split size.") {
    var dir: File = null
    try {
      dir = Utils.createTempDir()
      logInfo(s"Local disk address is ${dir.toString}.")

      WholeTextFileInputFormatSuite.files.foreach { case (filename, contents) =>
        createNativeFile(dir, filename, contents, compress = false)
      }
      // The job must run without the IOException that CombineFileInputFormat
      // raised before this fix when the configured minimums exceeded maxSplitSize.
      val fileCount = sc.wholeTextFiles(dir.toString).count()
      assert(fileCount === WholeTextFileInputFormatSuite.fileNames.length)
    } finally {
      Utils.deleteRecursively(dir)
    }
  }
}

/**
 * Files to be tested are defined here.
 */
object WholeTextFileInputFormatSuite {
  private val testWords: IndexedSeq[Byte] = "Spark is easy to use.\n".map(_.toByte)

  private val fileNames = Array("part-00000", "part-00001", "part-00002")
  private val fileLengths = Array(10, 100, 1000)

  private val files = fileLengths.zip(fileNames).map { case (upperBound, filename) =>
    filename -> Stream.continually(testWords.toList.toStream).flatten.take(upperBound).toArray
  }.toMap
}
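
To exercise just this suite from a Spark checkout, one typical invocation (assuming the sbt launcher bundled with the repo) is:

build/sbt "core/testOnly *WholeTextFileInputFormatSuite"

The test passes exactly when wholeTextFiles completes without the CombineFileInputFormat IOException, which is why it pins the per-node and per-rack minimums to 123456 bytes against files of at most 1000 bytes.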
