[SPARK-23786][SQL] Checking column names of csv headers #20894
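This PR adds header validation to the CSV datasource: when `header` is `true` and the user supplies a schema, the column names in the file's header are checked against the schema's field names, controlled by a new `checkHeader` option (default `true`). A minimal sketch of the user-facing behavior, assuming a `spark-shell` session; the path is a placeholder:

```scala
import org.apache.spark.sql.types.{DoubleType, StructType}
import spark.implicits._

// Write a CSV file whose header line is "f1,f2".
val path = "/tmp/csv-header-check"  // hypothetical location, for illustration only
Seq((1.0, 1234.5)).toDF("f1", "f2").write.option("header", "true").csv(path)

// Read it back with the field names swapped in the schema. With the check
// enabled (the default after this PR), collect() fails with
// "Fields in the header of csv file are not matched to field names of the schema".
val schema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
spark.read.schema(schema).option("header", "true").csv(path).collect()
```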
python/pyspark/sql/tests.py

```diff
@@ -2974,6 +2974,21 @@ def test_create_dateframe_from_pandas_with_dst(self):
         os.environ['TZ'] = orig_env_tz
         time.tzset()
 
+    def test_checking_csv_header(self):
+        tmpPath = tempfile.mkdtemp()
+        shutil.rmtree(tmpPath)
+        self.spark.createDataFrame([[1, 1000], [2000, 2]]).\
+            toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
+        schema = StructType([
+            StructField('f2', IntegerType(), nullable=True),
+            StructField('f1', IntegerType(), nullable=True)])
+        df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath)
+        self.assertRaisesRegexp(
+            Exception,
+            "Fields in the header of csv file are not matched to field names of the schema",
+            lambda: df.collect())
+        shutil.rmtree(tmpPath)
+
 
 class HiveSparkSubmitTests(SparkSubmitTests):
```
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala

```diff
@@ -50,7 +50,9 @@ abstract class CSVDataSource extends Serializable {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow]
+      schema: StructType, // Schema of the projection
+      dataSchema: StructType // Schema of the data in the CSV files
+  ): Iterator[InternalRow]
 
   /**
    * Infers the schema from `inputPaths` files.
@@ -127,7 +129,8 @@ object TextInputCSVDataSource extends CSVDataSource {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      dataSchema: StructType): Iterator[InternalRow] = {
     val lines = {
       val linesReader = new HadoopFileLinesReader(file, conf)
       Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close()))
@@ -136,8 +139,22 @@ object TextInputCSVDataSource extends CSVDataSource {
       }
     }
 
-    val shouldDropHeader = parser.options.headerFlag && file.start == 0
-    UnivocityParser.parseIterator(lines, shouldDropHeader, parser, schema)
+    val hasHeader = parser.options.headerFlag && file.start == 0
+    if (hasHeader) {
+      // Check that the column names in the header match the field names of the schema.
+      // The header is removed from `lines` as a side effect.
+      // Note: if the first block contains only comments, the header might not be extracted.
+      val checkHeader = UnivocityParser.checkHeader(
+        parser,
+        dataSchema,
+        _: String,
+        file.filePath
+      )
+      CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_))
+    }
+
+    UnivocityParser.parseIterator(lines, parser, schema)
   }
 
   override def infer(
@@ -204,24 +221,35 @@ object MultiLineCSVDataSource extends CSVDataSource {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      dataSchema: StructType): Iterator[InternalRow] = {
+    val checkHeader = UnivocityParser.checkHeaderColumnNames(
+      parser,
+      dataSchema,
+      _: Array[String],
+      file.filePath
+    )
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
       parser.options.headerFlag,
       parser,
-      schema)
+      schema,
+      checkHeader)
   }
 
   override def infer(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
+    // The header is not checked here because there is no schema against which it could be checked.
+    val checkHeader = (_: Array[String]) => ()
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(
         CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
         shouldDropHeader = false,
+        checkHeader,
         new CsvParser(parsedOptions.asParserSettings))
     }.take(1).headOption match {
       case Some(firstRow) =>
@@ -233,6 +261,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
             lines.getConfiguration,
             new Path(lines.getPath())),
           parsedOptions.headerFlag,
+          checkHeader,
           new CsvParser(parsedOptions.asParserSettings))
       }
       CSVInferSchema.infer(tokenRDD, header, parsedOptions)
```
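The `_: String` argument in `UnivocityParser.checkHeader(parser, dataSchema, _: String, file.filePath)` above is Scala partial application: every argument except the header line is fixed, yielding a `String => Unit` that is later applied to the header extracted by `CSVUtils.extractHeader`. A self-contained sketch of the pattern, using hypothetical names that are not part of the PR:

```scala
// check validates a raw header line against expected column names.
def check(expected: Seq[String], header: String, fileName: String): Unit = {
  val columns = header.split(",").toSeq
  require(columns == expected,
    s"Header of $fileName does not match: got $columns, expected $expected")
}

// Fixing all arguments except `header` produces a function value,
// just like the `checkHeader` value in the diff above.
val checkHeader: String => Unit = check(Seq("f1", "f2"), _: String, "cars.csv")

checkHeader("f1,f2")    // passes
// checkHeader("f2,f1") // would throw IllegalArgumentException
```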
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala

```diff
@@ -150,6 +150,12 @@ class CSVOptions(
 
   val isCommentSet = this.comment != '\u0000'
 
+  /**
+   * Enables checking of headers in CSV files. In particular, column names in a header
+   * are matched against field names of the provided schema.
+   */
+  val checkHeader = getBool("checkHeader", true)
+
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()
     val format = writerSettings.getFormat
```
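Based on the tests later in this PR, the new option can be passed both through `DataFrameReader` and through `OPTIONS` in SQL. A sketch assuming a running `spark` session; the file paths are placeholders:

```scala
// Disable the header check when the supplied column names deliberately
// differ from the names in the file's header.
val cars = spark.read
  .option("header", "true")
  .option("checkHeader", "false")
  .option("inferSchema", "true")
  .csv("/path/to/cars.csv")

// The same option in SQL; the declared column names need not match the header.
spark.sql(
  """CREATE TEMPORARY VIEW carsTable
    |(yearMade double, makeName string, modelName string)
    |USING csv
    |OPTIONS (path "/path/to/cars.tsv", header "true", checkHeader "false", delimiter "\t")
  """.stripMargin)
```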
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala

```diff
@@ -237,8 +237,9 @@ private[csv] object UnivocityParser {
   def tokenizeStream(
       inputStream: InputStream,
       shouldDropHeader: Boolean,
+      checkHeader: Array[String] => Unit,
       tokenizer: CsvParser): Iterator[Array[String]] = {
-    convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens)
+    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader)(tokens => tokens)
   }
 
   /**
@@ -248,26 +249,30 @@ private[csv] object UnivocityParser {
       inputStream: InputStream,
       shouldDropHeader: Boolean,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      checkHeader: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](
       input => Seq(parser.convert(input)),
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
-    convertStream(inputStream, shouldDropHeader, tokenizer) { tokens =>
+    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens =>
       safeParser.parse(tokens)
     }.flatten
   }
 
   private def convertStream[T](
       inputStream: InputStream,
       shouldDropHeader: Boolean,
-      tokenizer: CsvParser)(convert: Array[String] => T) = new Iterator[T] {
+      tokenizer: CsvParser,
+      checkHeader: Array[String] => Unit
+  )(convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {
       if (shouldDropHeader) {
-        tokenizer.parseNext()
+        val header = tokenizer.parseNext()
+        checkHeader(header)
       }
       tokenizer.parseNext()
     }
@@ -289,27 +294,52 @@ private[csv] object UnivocityParser {
    */
   def parseIterator(
       lines: Iterator[String],
-      shouldDropHeader: Boolean,
       parser: UnivocityParser,
       schema: StructType): Iterator[InternalRow] = {
     val options = parser.options
 
-    val linesWithoutHeader = if (shouldDropHeader) {
-      // Note that if there are only comments in the first block, the header would probably
-      // be not dropped.
-      CSVUtils.dropHeaderLine(lines, options)
-    } else {
-      lines
-    }
-
     val filteredLines: Iterator[String] =
-      CSVUtils.filterCommentAndEmpty(linesWithoutHeader, options)
+      CSVUtils.filterCommentAndEmpty(lines, options)
 
     val safeParser = new FailureSafeParser[String](
       input => Seq(parser.parse(input)),
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
 
     filteredLines.flatMap(safeParser.parse)
   }
 
+  def checkHeaderColumnNames(
+      parser: UnivocityParser,
+      schema: StructType,
+      columnNames: Array[String],
+      fileName: String
+  ): Unit = {
+    if (parser.options.checkHeader && columnNames != null) {
+      val fieldNames = schema.map(_.name)
+      val isMatched = fieldNames.zip(columnNames).forall { pair =>
+        val (nameInSchema, nameInHeader) = pair
+        nameInSchema == nameInHeader
+      }
+      if (!isMatched) {
+        throw new IllegalArgumentException(
+          s"""|Fields in the header of csv file are not matched to field names of the schema:
+              | Header: ${columnNames.mkString(", ")}
+              | Schema: ${fieldNames.mkString(", ")}
+              |CSV file: $fileName""".stripMargin
+        )
+      }
+    }
+  }
+
+  def checkHeader(
+      parser: UnivocityParser,
+      schema: StructType,
+      header: String,
+      fileName: String
+  ): Unit = {
+    lazy val columnNames = parser.tokenizer.parseLine(header)
+    checkHeaderColumnNames(parser, schema, columnNames, fileName)
+  }
 }
```

Review comment (on the `shouldDropHeader` parameter of `convertStream`): BTW, why did we rename this variable?

Author's reply: To show what exactly the parameter controls: dropping the first record in the stream. Responsibility for header manipulation belongs to a higher level.
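One detail of `checkHeaderColumnNames` worth noting: names are compared positionally via `zip`, so a header with more columns than the schema (or vice versa) passes the check as long as the overlapping prefix matches. A standalone sketch of that rule, illustrative only:

```scala
// Positional comparison of schema field names and header column names,
// mirroring the zip/forall logic in checkHeaderColumnNames above.
def namesMatch(fieldNames: Seq[String], columnNames: Seq[String]): Boolean =
  fieldNames.zip(columnNames).forall { case (inSchema, inHeader) => inSchema == inHeader }

println(namesMatch(Seq("f1", "f2"), Seq("f1", "f2")))        // true
println(namesMatch(Seq("f1", "f2"), Seq("f2", "f1")))        // false: order matters
println(namesMatch(Seq("f1", "f2"), Seq("f1", "f2", "f3")))  // true: zip ignores the extra column
```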
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

```diff
@@ -252,7 +252,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
          |(yearMade double, makeName string, modelName string, priceTag decimal,
          | comments string, grp string)
          |USING csv
-         |OPTIONS (path "${testFile(carsTsvFile)}", header "true", delimiter "\t")
+         |OPTIONS (
+         |  path "${testFile(carsTsvFile)}",
+         |  header "true", checkHeader "false",
+         |  delimiter "\t"
+         |)
       """.stripMargin.replaceAll("\n", " "))
 
     assert(
@@ -275,7 +279,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   test("test for blank column names on read and select columns") {
     val cars = spark.read
       .format("csv")
-      .options(Map("header" -> "true", "inferSchema" -> "true"))
+      .options(Map("header" -> "true", "checkHeader" -> "false", "inferSchema" -> "true"))
       .load(testFile(carsBlankColName))
 
     assert(cars.select("customer").collect().size == 2)
@@ -348,15 +352,15 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     spark.sql(
       s"""
          |CREATE TEMPORARY VIEW carsTable
-         |(yearMade double, makeName string, modelName string, comments string, blank string)
+         |(year double, make string, model string, comment string, blank string)
          |USING csv
          |OPTIONS (path "${testFile(carsFile)}", header "true")
       """.stripMargin.replaceAll("\n", " "))
 
     val cars = spark.table("carsTable")
     verifyCars(cars, withHeader = true, checkHeader = false, checkValues = false)
     assert(
-      cars.schema.fieldNames === Array("yearMade", "makeName", "modelName", "comments", "blank"))
+      cars.schema.fieldNames === Array("year", "make", "model", "comment", "blank"))
   }
 }
@@ -1279,4 +1283,30 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
     )
   }
+
+  def checkHeader(multiLine: String): Unit = {
+    test(s"SPARK-23786: Checking column names against schema ($multiLine)") {
+      withTempPath { path =>
+        import collection.JavaConverters._
+
+        val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+        val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+        odf.write.option("header", "true").csv(path.getCanonicalPath)
+        val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+        val exception = intercept[SparkException] {
+          spark.read
+            .schema(ischema)
+            .option("multiLine", multiLine)
+            .option("header", "true")
+            .option("checkHeader", "true")
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exception.getMessage.contains(
+          "Fields in the header of csv file are not matched to field names of the schema"
+        ))
+      }
+    }
+  }
+
+  List("false", "true").foreach(checkHeader(_))
 }
```
Review comment: try-except