Skip to content

Commit 4ad330b

Browse files
committed
Clarify JSON and CSV parser behavior.
1 parent 049f243 commit 4ad330b

File tree

3 files changed

+19
-7
lines changed

3 files changed

+19
-7
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,9 @@ class JacksonParser(
357357
}
358358
} catch {
359359
case e @ (_: RuntimeException | _: JsonProcessingException) =>
360+
// JSON parser currently doesn't support partial results for corrupted records.
361+
// For such records, all fields other than the field configured by
362+
// `columnNameOfCorruptRecord` are set to `null`.
360363
throw BadRecordException(() => recordLiteral(record), () => None, e)
361364
}
362365
}

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
345345
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
346346
* during parsing.
347347
* <ul>
348-
* <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
349-
* the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
348+
* <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
349+
* field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
350350
 * corrupt records, a user can set a string type field named `columnNameOfCorruptRecord`
351351
* in an user-defined schema. If a schema does not have the field, it drops corrupt records
352352
* during parsing. When inferring a schema, it implicitly adds a `columnNameOfCorruptRecord`
353-
* field in an output schema.</li>
353+
 * field in an output schema. It doesn't support partial results. Even if just one field
354+
 * can't be correctly parsed, all fields except for the field of `columnNameOfCorruptRecord`
355+
 * will be set to `null`.</li>
354356
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
355357
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
356358
* </ul>
@@ -550,12 +552,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
550552
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
551553
* during parsing. It supports the following case-insensitive modes.
552554
* <ul>
553-
* <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
554-
* the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
555+
* <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
556+
* field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
555557
 * corrupt records, a user can set a string type field named `columnNameOfCorruptRecord`
556558
* in an user-defined schema. If a schema does not have the field, it drops corrupt records
557-
* during parsing. When a length of parsed CSV tokens is shorter than an expected length
558-
* of a schema, it sets `null` for extra fields.</li>
559+
 * during parsing. It supports partial results for records with fewer or more tokens
560+
 * than the schema. When it meets a malformed record whose parsed tokens are fewer than
561+
 * the expected length of the schema, it sets `null` for the extra fields. When the
562+
 * tokens are more than the schema expects, it drops the extra tokens.</li>
559563
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
560564
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
561565
* </ul>

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ class UnivocityParser(
203203
case _: BadRecordException => None
204204
}
205205
}
206+
// For records with fewer or more tokens than the schema, tries to return partial results
207+
// if possible.
206208
throw BadRecordException(
207209
() => getCurrentInput,
208210
() => getPartialResult(),
@@ -218,6 +220,9 @@ class UnivocityParser(
218220
row
219221
} catch {
220222
case NonFatal(e) =>
223+
// For corrupted records with the number of tokens same as the schema,
224+
// CSV reader doesn't support partial results. All fields other than the field
225+
// configured by `columnNameOfCorruptRecord` are set to `null`.
221226
throw BadRecordException(() => getCurrentInput, () => None, e)
222227
}
223228
}

0 commit comments

Comments
 (0)