Skip to content

Commit 4ad330b

Browse files
committed
Clarify JSON and CSV parser behavior.
1 parent 049f243 commit 4ad330b

File tree

3 files changed

+19
-7
lines changed

3 files changed

+19
-7
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,9 @@ class JacksonParser(
357357
}
358358
} catch {
359359
case e @ (_: RuntimeException | _: JsonProcessingException) =>
360+
// JSON parser currently doesn't support partial results for corrupted records.
361+
// For such records, all fields other than the field configured by
362+
// `columnNameOfCorruptRecord` are set to `null`.
360363
throw BadRecordException(() => recordLiteral(record), () => None, e)
361364
}
362365
}

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
345345
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
346346
* during parsing.
347347
* <ul>
348-
* <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
349-
* the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
348+
* <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
349+
* field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
350350
 * corrupt records, a user can set a string type field named `columnNameOfCorruptRecord`
351351
* in an user-defined schema. If a schema does not have the field, it drops corrupt records
352352
* during parsing. When inferring a schema, it implicitly adds a `columnNameOfCorruptRecord`
353-
* field in an output schema.</li>
353+
 * field in an output schema. It doesn't support partial results. Even if just one field
354+
 * can't be correctly parsed, all fields except for the field of `columnNameOfCorruptRecord`
355+
 * will be set to `null`.</li>
354356
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
355357
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
356358
* </ul>
@@ -550,12 +552,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
550552
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
551553
* during parsing. It supports the following case-insensitive modes.
552554
* <ul>
553-
* <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
554-
* the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
555+
* <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
556+
* field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
555557
 * corrupt records, a user can set a string type field named `columnNameOfCorruptRecord`
556558
* in an user-defined schema. If a schema does not have the field, it drops corrupt records
557-
* during parsing. When a length of parsed CSV tokens is shorter than an expected length
558-
* of a schema, it sets `null` for extra fields.</li>
559+
 * during parsing. It supports partial results for records with fewer or more tokens
560+
 * than the schema. When it meets a malformed record whose parsed tokens are fewer than
561+
 * the expected length of the schema, it sets `null` for the extra fields. When the
562+
 * tokens are more than the schema expects, it drops the extra tokens.</li>
559563
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
560564
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
561565
* </ul>

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ class UnivocityParser(
203203
case _: BadRecordException => None
204204
}
205205
}
206+
// For records with fewer or more tokens than the schema, tries to return partial results
207+
// if possible.
206208
throw BadRecordException(
207209
() => getCurrentInput,
208210
() => getPartialResult(),
@@ -218,6 +220,9 @@ class UnivocityParser(
218220
row
219221
} catch {
220222
case NonFatal(e) =>
223+
// For corrupted records with the number of tokens same as the schema,
224+
// CSV reader doesn't support partial results. All fields other than the field
225+
// configured by `columnNameOfCorruptRecord` are set to `null`.
221226
throw BadRecordException(() => getCurrentInput, () => None, e)
222227
}
223228
}

0 commit comments

Comments
 (0)