Commit b1b35ca

sryza authored and pwendell committed
SPARK-5199. FS read metrics should support CombineFileSplits and track bytes from all FSs
...mbineFileSplits

Author: Sandy Ryza <[email protected]>

Closes #4050 from sryza/sandy-spark-5199 and squashes the following commits:

864514b [Sandy Ryza] Add tests and fix bug
0d504f1 [Sandy Ryza] Prettify
915c7e6 [Sandy Ryza] Get metrics from all filesystems
cdbc3e8 [Sandy Ryza] SPARK-5199. Input metrics should show up for InputFormats that return CombineFileSplits
1 parent fdaad4e commit b1b35ca

File tree: 6 files changed, +120 −32 lines
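
For context on what the change enables: after this patch, task-level input metrics are populated even when the InputFormat hands Spark a CombineFileSplit, and bytes are gathered from every FileSystem the task touched. A minimal sketch of how an application could observe this, assuming a Spark 1.x SparkContext named sc, an existing text file at inputPath, and Hadoop 2's CombineTextInputFormat on the classpath (none of these names come from the commit itself):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Collect the bytesRead reported by each finished task.
@volatile var bytesRead = 0L
sc.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // Before this commit, inputMetrics stayed empty for CombineFileSplit-based reads.
    bytesRead += Option(taskEnd.taskMetrics).flatMap(_.inputMetrics).map(_.bytesRead).getOrElse(0L)
  }
})

sc.newAPIHadoopFile(inputPath, classOf[CombineTextInputFormat],
  classOf[LongWritable], classOf[Text], sc.hadoopConfiguration).count()
// Once the listener bus drains, bytesRead should be at least the file's length.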

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 6 additions & 10 deletions
@@ -133,10 +133,9 @@ class SparkHadoopUtil extends Logging {
    * statistics are only available as of Hadoop 2.5 (see HADOOP-10688).
    * Returns None if the required method can't be found.
    */
-  private[spark] def getFSBytesReadOnThreadCallback(path: Path, conf: Configuration)
-    : Option[() => Long] = {
+  private[spark] def getFSBytesReadOnThreadCallback(): Option[() => Long] = {
     try {
-      val threadStats = getFileSystemThreadStatistics(path, conf)
+      val threadStats = getFileSystemThreadStatistics()
       val getBytesReadMethod = getFileSystemThreadStatisticsMethod("getBytesRead")
       val f = () => threadStats.map(getBytesReadMethod.invoke(_).asInstanceOf[Long]).sum
       val baselineBytesRead = f()
@@ -156,10 +155,9 @@
    * statistics are only available as of Hadoop 2.5 (see HADOOP-10688).
    * Returns None if the required method can't be found.
    */
-  private[spark] def getFSBytesWrittenOnThreadCallback(path: Path, conf: Configuration)
-    : Option[() => Long] = {
+  private[spark] def getFSBytesWrittenOnThreadCallback(): Option[() => Long] = {
     try {
-      val threadStats = getFileSystemThreadStatistics(path, conf)
+      val threadStats = getFileSystemThreadStatistics()
       val getBytesWrittenMethod = getFileSystemThreadStatisticsMethod("getBytesWritten")
       val f = () => threadStats.map(getBytesWrittenMethod.invoke(_).asInstanceOf[Long]).sum
       val baselineBytesWritten = f()
@@ -172,10 +170,8 @@
     }
   }
 
-  private def getFileSystemThreadStatistics(path: Path, conf: Configuration): Seq[AnyRef] = {
-    val qualifiedPath = path.getFileSystem(conf).makeQualified(path)
-    val scheme = qualifiedPath.toUri().getScheme()
-    val stats = FileSystem.getAllStatistics().filter(_.getScheme().equals(scheme))
+  private def getFileSystemThreadStatistics(): Seq[AnyRef] = {
+    val stats = FileSystem.getAllStatistics()
     stats.map(Utils.invoke(classOf[Statistics], _, "getThreadStatistics"))
   }
 
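
The reflective helper above now sums thread-local statistics across every registered FileSystem rather than filtering by the scheme of a single path. A rough, non-reflective sketch of the same computation, assuming Hadoop 2.5+ where Statistics.getThreadStatistics can be called directly (Spark keeps the reflection so it still builds against older Hadoop releases):

import scala.collection.JavaConverters._
import org.apache.hadoop.fs.FileSystem

// Returns a callback reporting bytes read on this thread since the callback was created,
// aggregated over all FileSystem implementations (HDFS, local, S3, ...).
def bytesReadOnThreadCallback(): () => Long = {
  def current(): Long =
    FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum
  val baseline = current()
  () => current() - baseline
}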

core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@ package org.apache.spark.executor
 
 import java.util.concurrent.atomic.AtomicLong
 
-import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.executor.DataReadMethod.DataReadMethod
 
 import scala.collection.mutable.ArrayBuffer

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 7 additions & 5 deletions
@@ -35,6 +35,7 @@ import org.apache.hadoop.mapred.Reporter
 import org.apache.hadoop.mapred.JobID
 import org.apache.hadoop.mapred.TaskAttemptID
 import org.apache.hadoop.mapred.TaskID
+import org.apache.hadoop.mapred.lib.CombineFileSplit
 import org.apache.hadoop.util.ReflectionUtils
 
 import org.apache.spark._
@@ -218,13 +219,13 @@ class HadoopRDD[K, V](
 
       // Find a function that will return the FileSystem bytes read by this thread. Do this before
       // creating RecordReader, because RecordReader's constructor might read some bytes
-      val bytesReadCallback = inputMetrics.bytesReadCallback.orElse(
+      val bytesReadCallback = inputMetrics.bytesReadCallback.orElse {
         split.inputSplit.value match {
-          case split: FileSplit =>
-            SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, jobConf)
+          case _: FileSplit | _: CombineFileSplit =>
+            SparkHadoopUtil.get.getFSBytesReadOnThreadCallback()
           case _ => None
         }
-      )
+      }
       inputMetrics.setBytesReadCallback(bytesReadCallback)
 
       var reader: RecordReader[K, V] = null
@@ -254,7 +255,8 @@
         reader.close()
         if (bytesReadCallback.isDefined) {
           inputMetrics.updateBytesRead()
-        } else if (split.inputSplit.value.isInstanceOf[FileSplit]) {
+        } else if (split.inputSplit.value.isInstanceOf[FileSplit] ||
+                   split.inputSplit.value.isInstanceOf[CombineFileSplit]) {
          // If we can't get the bytes read from the FS stats, fall back to the split size,
          // which may be inaccurate.
          try {
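
One reason the match arm had to be widened: in the old mapred API, CombineFileSplit is not a subclass of FileSplit, so the previous `case split: FileSplit` arm (and the split-size fallback) silently skipped combined splits. A small illustration of that, using a made-up local path that is not part of the commit:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileSplit, JobConf}
import org.apache.hadoop.mapred.lib.CombineFileSplit

// Build a one-file combined split by hand (path, offset, and length are invented).
val combined = new CombineFileSplit(new JobConf(), Array(new Path("/tmp/part-00000")),
  Array(0L), Array(128L), Array.empty[String])

println(combined.isInstanceOf[FileSplit])         // false: the old pattern never matched it
println(combined.isInstanceOf[CombineFileSplit])  // true: the new `_: CombineFileSplit` arm does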

core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 8 additions & 7 deletions
@@ -25,7 +25,7 @@ import scala.reflect.ClassTag
 import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.input.FileSplit
+import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit}
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.input.WholeTextFileInputFormat
@@ -34,7 +34,7 @@ import org.apache.spark.Logging
 import org.apache.spark.Partition
 import org.apache.spark.SerializableWritable
 import org.apache.spark.{SparkContext, TaskContext}
-import org.apache.spark.executor.{DataReadMethod, InputMetrics}
+import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
 import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
 import org.apache.spark.util.Utils
@@ -114,13 +114,13 @@ class NewHadoopRDD[K, V](
 
       // Find a function that will return the FileSystem bytes read by this thread. Do this before
       // creating RecordReader, because RecordReader's constructor might read some bytes
-      val bytesReadCallback = inputMetrics.bytesReadCallback.orElse(
+      val bytesReadCallback = inputMetrics.bytesReadCallback.orElse {
         split.serializableHadoopSplit.value match {
-          case split: FileSplit =>
-            SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, conf)
+          case _: FileSplit | _: CombineFileSplit =>
+            SparkHadoopUtil.get.getFSBytesReadOnThreadCallback()
           case _ => None
         }
-      )
+      }
       inputMetrics.setBytesReadCallback(bytesReadCallback)
 
       val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0)
@@ -163,7 +163,8 @@
         reader.close()
         if (bytesReadCallback.isDefined) {
           inputMetrics.updateBytesRead()
-        } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit]) {
+        } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
+            split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
          // If we can't get the bytes read from the FS stats, fall back to the split size,
          // which may be inaccurate.
          try {

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 4 additions & 7 deletions
@@ -990,7 +990,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       val committer = format.getOutputCommitter(hadoopContext)
       committer.setupTask(hadoopContext)
 
-      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)
+      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context)
 
       val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]]
       try {
@@ -1061,7 +1061,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       // around by taking a mod. We expect that no task will be attempted 2 billion times.
       val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt
 
-      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)
+      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context)
 
       writer.setup(context.stageId, context.partitionId, taskAttemptId)
       writer.open()
@@ -1086,11 +1086,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     writer.commitJob()
   }
 
-  private def initHadoopOutputMetrics(context: TaskContext, config: Configuration)
-    : (OutputMetrics, Option[() => Long]) = {
-    val bytesWrittenCallback = Option(config.get("mapreduce.output.fileoutputformat.outputdir"))
-      .map(new Path(_))
-      .flatMap(SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(_, config))
+  private def initHadoopOutputMetrics(context: TaskContext): (OutputMetrics, Option[() => Long]) = {
+    val bytesWrittenCallback = SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback()
     val outputMetrics = new OutputMetrics(DataWriteMethod.Hadoop)
     if (bytesWrittenCallback.isDefined) {
       context.taskMetrics.outputMetrics = Some(outputMetrics)
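
On the write side, initHadoopOutputMetrics no longer needs the job's output directory: the callback simply sums bytes written from every filesystem's thread statistics. A hedged sketch of how the resulting output metrics might be checked from an application (sc and outPath are stand-ins; the pattern mirrors the listener already used in InputOutputMetricsSuite):

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

val taskBytesWritten = new ArrayBuffer[Long]()
sc.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // outputMetrics is only set when a bytes-written callback was available.
    taskBytesWritten ++= Option(taskEnd.taskMetrics).flatMap(_.outputMetrics).map(_.bytesWritten)
  }
})

sc.parallelize(1 to 1000, 2).map(i => (i, i.toString)).saveAsTextFile(outPath)
// Once the listener bus has drained, each write task should report a non-zero value.
println(taskBytesWritten.mkString(", "))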

core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala

Lines changed: 95 additions & 2 deletions
@@ -26,7 +26,16 @@ import org.scalatest.FunSuite
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.io.{LongWritable, Text}
-import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}
+import org.apache.hadoop.mapred.{FileSplit => OldFileSplit, InputSplit => OldInputSplit, JobConf,
+  LineRecordReader => OldLineRecordReader, RecordReader => OldRecordReader, Reporter,
+  TextInputFormat => OldTextInputFormat}
+import org.apache.hadoop.mapred.lib.{CombineFileInputFormat => OldCombineFileInputFormat,
+  CombineFileSplit => OldCombineFileSplit, CombineFileRecordReader => OldCombineFileRecordReader}
+import org.apache.hadoop.mapreduce.{InputSplit => NewInputSplit, RecordReader => NewRecordReader,
+  TaskAttemptContext}
+import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat => NewCombineFileInputFormat,
+  CombineFileRecordReader => NewCombineFileRecordReader, CombineFileSplit => NewCombineFileSplit,
+  FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat}
 
 import org.apache.spark.SharedSparkContext
 import org.apache.spark.deploy.SparkHadoopUtil
@@ -202,7 +211,7 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext {
     val fs = FileSystem.getLocal(new Configuration())
     val outPath = new Path(fs.getWorkingDirectory, "outdir")
 
-    if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(outPath, fs.getConf).isDefined) {
+    if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback().isDefined) {
       val taskBytesWritten = new ArrayBuffer[Long]()
       sc.addSparkListener(new SparkListener() {
         override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
@@ -225,4 +234,88 @@
       }
     }
   }
+
+  test("input metrics with old CombineFileInputFormat") {
+    val bytesRead = runAndReturnBytesRead {
+      sc.hadoopFile(tmpFilePath, classOf[OldCombineTextInputFormat], classOf[LongWritable],
+        classOf[Text], 2).count()
+    }
+    assert(bytesRead >= tmpFile.length())
+  }
+
+  test("input metrics with new CombineFileInputFormat") {
+    val bytesRead = runAndReturnBytesRead {
+      sc.newAPIHadoopFile(tmpFilePath, classOf[NewCombineTextInputFormat], classOf[LongWritable],
+        classOf[Text], new Configuration()).count()
+    }
+    assert(bytesRead >= tmpFile.length())
+  }
+}
+
+/**
+ * Hadoop 2 has a version of this, but we can't use it for backwards compatibility
+ */
+class OldCombineTextInputFormat extends OldCombineFileInputFormat[LongWritable, Text] {
+  override def getRecordReader(split: OldInputSplit, conf: JobConf, reporter: Reporter)
+    : OldRecordReader[LongWritable, Text] = {
+    new OldCombineFileRecordReader[LongWritable, Text](conf,
+      split.asInstanceOf[OldCombineFileSplit], reporter, classOf[OldCombineTextRecordReaderWrapper]
+      .asInstanceOf[Class[OldRecordReader[LongWritable, Text]]])
+  }
+}
+
+class OldCombineTextRecordReaderWrapper(
+    split: OldCombineFileSplit,
+    conf: Configuration,
+    reporter: Reporter,
+    idx: Integer) extends OldRecordReader[LongWritable, Text] {
+
+  val fileSplit = new OldFileSplit(split.getPath(idx),
+    split.getOffset(idx),
+    split.getLength(idx),
+    split.getLocations())
+
+  val delegate: OldLineRecordReader = new OldTextInputFormat().getRecordReader(fileSplit,
+    conf.asInstanceOf[JobConf], reporter).asInstanceOf[OldLineRecordReader]
+
+  override def next(key: LongWritable, value: Text): Boolean = delegate.next(key, value)
+  override def createKey(): LongWritable = delegate.createKey()
+  override def createValue(): Text = delegate.createValue()
+  override def getPos(): Long = delegate.getPos
+  override def close(): Unit = delegate.close()
+  override def getProgress(): Float = delegate.getProgress
+}
+
+/**
+ * Hadoop 2 has a version of this, but we can't use it for backwards compatibility
+ */
+class NewCombineTextInputFormat extends NewCombineFileInputFormat[LongWritable,Text] {
+  def createRecordReader(split: NewInputSplit, context: TaskAttemptContext)
+    : NewRecordReader[LongWritable, Text] = {
+    new NewCombineFileRecordReader[LongWritable,Text](split.asInstanceOf[NewCombineFileSplit],
+      context, classOf[NewCombineTextRecordReaderWrapper])
+  }
 }
+
+class NewCombineTextRecordReaderWrapper(
+    split: NewCombineFileSplit,
+    context: TaskAttemptContext,
+    idx: Integer) extends NewRecordReader[LongWritable, Text] {
+
+  val fileSplit = new NewFileSplit(split.getPath(idx),
+    split.getOffset(idx),
+    split.getLength(idx),
+    split.getLocations())
+
+  val delegate = new NewTextInputFormat().createRecordReader(fileSplit, context)
+
+  override def initialize(split: NewInputSplit, context: TaskAttemptContext): Unit = {
+    delegate.initialize(fileSplit, context)
+  }
+
+  override def nextKeyValue(): Boolean = delegate.nextKeyValue()
+  override def getCurrentKey(): LongWritable = delegate.getCurrentKey
+  override def getCurrentValue(): Text = delegate.getCurrentValue
+  override def getProgress(): Float = delegate.getProgress
+  override def close(): Unit = delegate.close()
+}
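
The two new tests call a runAndReturnBytesRead helper that lives elsewhere in this suite and is not part of the hunk above. For readers following along, a plausible sketch of such a helper, written as a method to sit inside the suite (an illustration under assumptions, not the suite's actual code; it assumes the SharedSparkContext sc and the private[spark] listener-bus flush used in Spark's own tests):

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Runs `job`, then sums the bytesRead reported by every completed task.
def runAndReturnBytesReadSketch(job: => Unit): Long = {
  val taskBytesRead = new ArrayBuffer[Long]()
  sc.addSparkListener(new SparkListener {
    override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
      taskBytesRead += taskEnd.taskMetrics.inputMetrics.map(_.bytesRead).getOrElse(0L)
    }
  })
  job
  sc.listenerBus.waitUntilEmpty(500)  // assumed test-only helper: drain pending task-end events
  taskBytesRead.sum
}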
