@@ -249,12 +249,9 @@ abstract class PartitioningAwareFileIndex(
           pathsToFetch += path
       }
     }
-    val discovered = if (pathsToFetch.length >=
-        sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-      PartitioningAwareFileIndex.listLeafFilesInParallel(pathsToFetch, hadoopConf, sparkSession)
-    } else {
-      PartitioningAwareFileIndex.listLeafFilesInSerial(pathsToFetch, hadoopConf)
-    }
+    val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass))
+    val discovered = PartitioningAwareFileIndex.bulkListLeafFiles(
+      pathsToFetch, hadoopConf, filter, sparkSession)
     discovered.foreach { case (path, leafFiles) =>
       HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
       fileStatusCache.putLeafFiles(path, leafFiles.toArray)
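With this change, the caller resolves the Hadoop `PathFilter` once, from a throwaway `JobConf`, and hands it to a single `bulkListLeafFiles` entry point instead of choosing between the old serial and parallel helpers. A rough, self-contained sketch of how a user-supplied filter would flow through that lookup, assuming the standard `mapred` `FileInputFormat.setInputPathFilter` hook (the `SkipTmpFilter` class and `FilterSketch` object are illustrative, not part of the patch):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

// Illustrative filter, not part of the patch: skip temporary files.
class SkipTmpFilter extends PathFilter {
  override def accept(path: Path): Boolean = !path.getName.endsWith(".tmp")
}

object FilterSketch {
  // Register a filter the way MapReduce jobs do, then read it back the way the
  // new call site does. getInputPathFilter returns null when nothing is registered.
  def resolveFilter(hadoopConf: Configuration): PathFilter = {
    val jobConf = new JobConf(hadoopConf, getClass)
    FileInputFormat.setInputPathFilter(jobConf, classOf[SkipTmpFilter])
    FileInputFormat.getInputPathFilter(jobConf)
  }
}
```

If no filter class is registered, the lookup yields `null`, which the listing code below tolerates by skipping the `filter.accept` check.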
@@ -286,31 +283,28 @@ object PartitioningAwareFileIndex extends Logging {
       blockLocations: Array[SerializableBlockLocation])
 
   /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[(Path, Seq[FileStatus])] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.map { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      (path, listLeafFiles0(fs, path, filter))
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
+   * Lists a collection of paths recursively. Picks the listing strategy adaptively depending
+   * on the number of paths to list.
+   *
+   * This may only be called on the driver.
+   *
+   * @return for each input path, the set of discovered files for the path
    */
-  private def listLeafFilesInParallel(
+  private def bulkListLeafFiles(
       paths: Seq[Path],
       hadoopConf: Configuration,
+      filter: PathFilter,
       sparkSession: SparkSession): Seq[(Path, Seq[FileStatus])] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
+
+    // Short-circuits parallel listing when serial listing is likely to be faster.
+    if (paths.size < sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+      return paths.map { path =>
+        (path, listLeafFiles(path, hadoopConf, filter, Some(sparkSession)))
+      }
+    }
+
     logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+    HiveCatalogMetrics.incrementParallelListingJobCount(1)
 
     val sparkContext = sparkSession.sparkContext
     val serializableConfiguration = new SerializableConfiguration(hadoopConf)
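`bulkListLeafFiles` now makes the serial-versus-parallel decision itself, keyed off `parallelPartitionDiscoveryThreshold`, instead of asserting that the caller already made it. That threshold should correspond to the `spark.sql.sources.parallelPartitionDiscovery.threshold` SQL conf, so the cut-over point can be tuned per session; a minimal spark-shell-style sketch (the value shown is illustrative, not a recommendation):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("listing-threshold-sketch")
  // Below this many input paths, listing stays serial on the driver; at or above
  // it, a Spark job is launched to list the paths in parallel.
  .config("spark.sql.sources.parallelPartitionDiscovery.threshold", "64")
  .getOrCreate()
```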
@@ -324,9 +318,11 @@ object PartitioningAwareFileIndex extends Logging {
 
     val statusMap = sparkContext
       .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
+      .mapPartitions { pathStrings =>
         val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
+        pathStrings.map(new Path(_)).toSeq.map { path =>
+          (path, listLeafFiles(path, hadoopConf, filter, None))
+        }.iterator
       }.map { case (path, statuses) =>
         val serializableStatuses = statuses.map { status =>
           // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
@@ -374,11 +370,20 @@ object PartitioningAwareFileIndex extends Logging {
   }
 
   /**
-   * List a single path, provided as a FileStatus, in serial.
+   * Lists a single filesystem path recursively. If a SparkSession object is specified, this
+   * function may launch Spark jobs to parallelize listing.
+   *
+   * If sessionOpt is None, this may be called on executors.
+   *
+   * @return all children of path that match the specified filter.
    */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
+  private def listLeafFiles(
+      path: Path,
+      hadoopConf: Configuration,
+      filter: PathFilter,
+      sessionOpt: Option[SparkSession]): Seq[FileStatus] = {
     logTrace(s"Listing $path")
+    val fs = path.getFileSystem(hadoopConf)
     val name = path.getName.toLowerCase
     if (shouldFilterOut(name)) {
       Seq.empty[FileStatus]
@@ -393,9 +398,15 @@ object PartitioningAwareFileIndex extends Logging {
     }
 
     val allLeafStatuses = {
-      val (dirs, files) = statuses.partition(_.isDirectory)
-      val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-      if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
+      val (dirs, topLevelFiles) = statuses.partition(_.isDirectory)
+      val nestedFiles: Seq[FileStatus] = sessionOpt match {
+        case Some(session) =>
+          bulkListLeafFiles(dirs.map(_.getPath), hadoopConf, filter, session).flatMap(_._2)
+        case _ =>
+          dirs.flatMap(dir => listLeafFiles(dir.getPath, hadoopConf, filter, sessionOpt))
+      }
+      val allFiles = topLevelFiles ++ nestedFiles
+      if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
     }
 
     allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
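Taken together, the last two hunks thread an `Option[SparkSession]` down the recursion: with `Some(session)` (driver side) nested directories re-enter `bulkListLeafFiles` and may fan out to executors, while with `None` (executor side) the recursion stays local to the task. A stripped-down sketch of that dispatch pattern, using plain `java.io.File` and a boolean `onDriver` flag as stand-ins for the real types (all names here are hypothetical, not the patch's code):

```scala
import java.io.File
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration

object ListingSketch {
  // Stand-in for the session's parallelPartitionDiscoveryThreshold.
  val parallelListingThreshold = 4

  // Stand-in for bulkListLeafFiles: picks serial or "parallel" listing by path count.
  def bulkList(dirs: Seq[File], onDriver: Boolean): Seq[File] = {
    if (!onDriver || dirs.size < parallelListingThreshold) {
      // Serial path: also what every executor-side recursion uses.
      dirs.flatMap(d => listLeafFiles(d, onDriver))
    } else {
      // Driver path: fan out one level. The real code launches a Spark job here;
      // futures stand in for that, and the recursion below runs "as on executors".
      val futures = dirs.map(d => Future(listLeafFiles(d, onDriver = false)))
      Await.result(Future.sequence(futures), Duration.Inf).flatten
    }
  }

  // Stand-in for listLeafFiles: recurses through bulkList so driver-side calls can
  // re-trigger parallel listing when a directory has many children.
  def listLeafFiles(dir: File, onDriver: Boolean): Seq[File] = {
    val children = Option(dir.listFiles()).map(_.toSeq).getOrElse(Seq.empty)
    val (dirs, files) = children.partition(_.isDirectory)
    files ++ bulkList(dirs, onDriver)
  }
}
```

A driver-side call would look like `ListingSketch.listLeafFiles(new File("/tmp/warehouse"), onDriver = true)` (path illustrative); every recursive call spawned from the "parallel" branch passes `onDriver = false`, mirroring how the patch passes `None` to executors.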