Commit 2d58a2b

Skips reading row group information when using task side metadata reading
1 parent 7aa3748
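Context for the change: when parquet.task.side.metadata is enabled (the default; see ParquetInputFormat.TASK_SIDE_METADATA), row group metadata is read inside individual tasks rather than on the driver, so the footers read during planning are only needed for their file schemas. Passing that flag down to parquet-mr's footer readers as the skipRowGroups argument lets them skip row group information entirely, avoiding redundant metadata work on the driver for tables with many part-files.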

File tree

1 file changed: +17 −10 lines


sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala

Lines changed: 17 additions & 10 deletions
@@ -23,12 +23,11 @@ import scala.collection.JavaConversions._
 import scala.util.Try
 
 import com.google.common.base.Objects
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import parquet.filter2.predicate.FilterApi
-import parquet.format.converter.ParquetMetadataConverter
 import parquet.hadoop._
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
@@ -269,7 +268,7 @@ private[sql] class ParquetRelation2(
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean
     conf.set(SQLConf.PARQUET_CACHE_METADATA, useMetadataCache.toString)
 
-    val footers = inputFiles.map(metadataCache.footers)
+    val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
 
     // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
     // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects and
@@ -330,7 +329,7 @@ private[sql] class ParquetRelation2(
     private var commonMetadataStatuses: Array[FileStatus] = _
 
     // Parquet footer cache.
-    var footers: Map[FileStatus, Footer] = _
+    var footers: Map[Path, Footer] = _
 
     // `FileStatus` objects of all data files (Parquet part-files).
     var dataStatuses: Array[FileStatus] = _
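A note on the key change above: the footers are about to come from parquet-mr's own readers (next hunk), and a parquet Footer exposes its file as a Path via getFile rather than as a FileStatus. Keying the cache by Path lets it be rebuilt directly from the returned footers, with lookups going through f.getPath as in the hunks before and after.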
@@ -357,11 +356,19 @@
       commonMetadataStatuses =
         leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
 
-      footers = (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f =>
-        val parquetMetadata = ParquetFileReader.readFooter(
-          SparkHadoopUtil.get.conf, f, ParquetMetadataConverter.NO_FILTER)
-        f -> new Footer(f.getPath, parquetMetadata)
-      }.seq.toMap
+      footers = {
+        val conf = SparkHadoopUtil.get.conf
+        val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
+        val rawFooters = if (shouldMergeSchemas) {
+          ParquetFileReader.readAllFootersInParallel(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        } else {
+          ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
+            conf, seqAsJavaList(leaves), taskSideMetaData)
+        }
+
+        rawFooters.map(footer => footer.getFile -> footer).toMap
+      }
 
       dataSchema = {
         val dataSchema0 =
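For reference, a minimal, self-contained sketch of the parquet-mr calls the hunk above relies on. The object name and empty file listing are illustrative stand-ins, not part of the patch:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}

import parquet.hadoop.{Footer, ParquetFileReader, ParquetInputFormat}

import scala.collection.JavaConversions._

object FooterSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Defaults to true, matching the patch: with task-side metadata, each
    // task reads the row groups it needs, so driver-side footers can skip them.
    val skipRowGroups = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)

    // Stand-in for `leaves` in the patch (data files plus summary files).
    val leaves: Seq[FileStatus] = Seq.empty

    // Reads every footer in parallel; with skipRowGroups = true, only the
    // file schema and key/value metadata are materialized.
    val rawFooters: Seq[Footer] =
      ParquetFileReader
        .readAllFootersInParallel(conf, seqAsJavaList(leaves), skipRowGroups)
        .toSeq

    // Path-keyed footer cache, as introduced by this commit.
    val footers: Map[Path, Footer] = rawFooters.map(f => f.getFile -> f).toMap

    println(s"Cached ${footers.size} footer(s)")
  }
}

Note that the patch takes readAllFootersInParallel only when shouldMergeSchemas is set, falling back to readAllFootersInParallelUsingSummaryFiles otherwise, presumably because schema merging wants each part-file's own footer rather than the contents of a _metadata / _common_metadata summary.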
@@ -428,7 +435,7 @@
           "No schema defined, " +
             s"and no Parquet data file or summary file found under ${paths.mkString(", ")}.")
 
-      ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext)
+      ParquetRelation2.readSchema(filesToTouch.map(f => footers.apply(f.getPath)), sqlContext)
     }
   }
 }
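To compare against the old client-side behavior, the flag can in principle be routed through SparkConf, since Spark copies any spark.hadoop.* entry into the Hadoop Configurations it builds. Whether this particular code path picks it up depends on how SparkHadoopUtil constructs its Configuration, so treat the following as a hypothetical, untested sketch:

import parquet.hadoop.ParquetInputFormat

// Hypothetical toggle: forces client-side metadata, so driver-side footer
// reads include row group information again.
val sparkConf = new org.apache.spark.SparkConf()
  .set("spark.hadoop." + ParquetInputFormat.TASK_SIDE_METADATA, "false")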
