@@ -23,12 +23,11 @@ import scala.collection.JavaConversions._
 import scala.util.Try
 
 import com.google.common.base.Objects
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import parquet.filter2.predicate.FilterApi
-import parquet.format.converter.ParquetMetadataConverter
 import parquet.hadoop._
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
@@ -269,7 +268,7 @@ private[sql] class ParquetRelation2(
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean
     conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
 
-    val footers = inputFiles.map(metadataCache.footers)
+    val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
 
     // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
     // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects and
@@ -330,7 +329,7 @@ private[sql] class ParquetRelation2(
     private var commonMetadataStatuses: Array[FileStatus] = _
 
     // Parquet footer cache.
-    var footers: Map[FileStatus, Footer] = _
+    var footers: Map[Path, Footer] = _
 
     // `FileStatus` objects of all data files (Parquet part-files).
     var dataStatuses: Array[FileStatus] = _
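The key-type change above ripples through every consumer of the cache: anything holding a `FileStatus` now derives the lookup key via `getPath`, as the `buildScan` hunk earlier does with `inputFiles`. A minimal sketch of that access pattern, with hypothetical names (`lookupFooters`, `cachedFooters`, `partFiles`):

```scala
import org.apache.hadoop.fs.{FileStatus, Path}
import parquet.hadoop.Footer

// Sketch only: the footer cache is keyed by Path, so callers that hold
// FileStatus objects (as buildScan does) look up footers via getPath.
// `lookupFooters`, `cachedFooters`, and `partFiles` are hypothetical names.
def lookupFooters(
    cachedFooters: Map[Path, Footer],
    partFiles: Seq[FileStatus]): Seq[Footer] =
  partFiles.map(f => cachedFooters(f.getPath))
```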
@@ -357,11 +356,19 @@ private[sql] class ParquetRelation2(
       commonMetadataStatuses =
         leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
 
-      footers = (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f =>
-        val parquetMetadata = ParquetFileReader.readFooter(
-          SparkHadoopUtil.get.conf, f, ParquetMetadataConverter.NO_FILTER)
-        f -> new Footer(f.getPath, parquetMetadata)
-      }.seq.toMap
+      footers = {
+        val conf = SparkHadoopUtil.get.conf
+        val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
+        val rawFooters = if (shouldMergeSchemas) {
+          ParquetFileReader.readAllFootersInParallel(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        } else {
+          ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        }
+
+        rawFooters.map(footer => footer.getFile -> footer).toMap
+      }
 
       dataSchema = {
         val dataSchema0 =
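The rewritten `footers` block above delegates to parquet-mr's bulk readers instead of reading each footer by hand with `readFooter`. A standalone sketch of the same pattern, assuming only the `parquet.hadoop` API already shown in the diff (the boolean flag is the one driven by `ParquetInputFormat.TASK_SIDE_METADATA` above); `footerIndex` and `mergeSchemas` are hypothetical names:

```scala
import scala.collection.JavaConversions._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import parquet.hadoop.{Footer, ParquetFileReader, ParquetInputFormat}

// Hypothetical helper mirroring the hunk above: read footers for a set of
// part-files in parallel and index them by file Path. When schemas must be
// merged, every part-file's own footer is read; otherwise summary files
// (_metadata / _common_metadata) may stand in for individual footers.
def footerIndex(
    conf: Configuration,
    leaves: Seq[FileStatus],
    mergeSchemas: Boolean): Map[Path, Footer] = {
  val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
  val rawFooters = if (mergeSchemas) {
    ParquetFileReader.readAllFootersInParallel(
      conf, seqAsJavaList(leaves), taskSideMetaData)
  } else {
    ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
      conf, seqAsJavaList(leaves), taskSideMetaData)
  }
  rawFooters.map(footer => footer.getFile -> footer).toMap
}
```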
@@ -428,7 +435,7 @@ private[sql] class ParquetRelation2(
         "No schema defined, " +
           s"and no Parquet data file or summary file found under ${paths.mkString(", ")}.")
 
-      ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext)
+      ParquetRelation2.readSchema(filesToTouch.map(f => footers.apply(f.getPath)), sqlContext)
     }
   }
 }