
Commit 6097788

[SPARK-20136][SQL] Add num files and metadata operation timing to scan operator metrics
## What changes were proposed in this pull request?

This patch adds explicit metadata operation timing and number of files in data source metrics. Those would be useful to include for performance profiling.

Screenshot of a UI with this change (num files and metadata time are new metrics):

<img width="321" alt="screen shot 2017-03-29 at 12 29 28 am" src="https://cloud.githubusercontent.com/assets/323388/24443272/d4ea58c0-1416-11e7-8940-ecb69375554a.png">

## How was this patch tested?

N/A

Author: Reynold Xin <[email protected]>

Closes #17465 from rxin/SPARK-20136.
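A minimal sketch (not part of this commit) of how the new `numFiles` and `metadataTime` metrics could be read back from the scan operator after a query runs. The `local[*]` session, app name, and Parquet path are placeholders; the metric keys are the ones introduced by this patch.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec

object ScanMetricsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("scan-metrics-demo")
      .getOrCreate()

    // Hypothetical input path; any file-based source (Parquet, ORC, JSON, ...) works.
    val df = spark.read.parquet("/tmp/example")
    df.collect() // trigger the scan so the metrics are populated

    // Find the file scan operator in the executed plan and print its metrics,
    // which now include the "numFiles" and "metadataTime" entries.
    df.queryExecution.executedPlan.collect {
      case scan: FileSourceScanExec =>
        scan.metrics.foreach { case (name, metric) =>
          println(s"$name = ${metric.value}")
        }
    }

    spark.stop()
  }
}
```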
1 parent 22f07fe commit 6097788

File tree

1 file changed (+16, -2 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 16 additions & 2 deletions
```diff
@@ -171,8 +171,20 @@ case class FileSourceScanExec(
     false
   }
 
-  @transient private lazy val selectedPartitions =
-    relation.location.listFiles(partitionFilters, dataFilters)
+  @transient private lazy val selectedPartitions: Seq[PartitionDirectory] = {
+    val startTime = System.nanoTime()
+    val ret = relation.location.listFiles(partitionFilters, dataFilters)
+    val timeTaken = (System.nanoTime() - startTime) / 1000 / 1000
+
+    metrics("numFiles").add(ret.map(_.files.size.toLong).sum)
+    metrics("metadataTime").add(timeTaken)
+
+    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId,
+      metrics("numFiles") :: metrics("metadataTime") :: Nil)
+
+    ret
+  }
 
   override val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = {
     val bucketSpec = if (relation.sparkSession.sessionState.conf.bucketingEnabled) {
@@ -293,6 +305,8 @@ case class FileSourceScanExec(
 
   override lazy val metrics =
     Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
+      "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files"),
+      "metadataTime" -> SQLMetrics.createMetric(sparkContext, "metadata time (ms)"),
       "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time"))
 
   protected override def doExecute(): RDD[InternalRow] = {
```
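A note on the design: `selectedPartitions` is evaluated lazily on the driver, not inside executor tasks, so its metric updates would never flow back through the usual task accumulator path. That is why the patch posts them explicitly with `SQLMetrics.postDriverMetricUpdates` under the current execution ID, so the SQL UI picks up the new values alongside the task-reported metrics.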
