
Commit 80dba17

Author: Nathan Howell

Add comments regarding null handling and empty strings
1 parent 842846d commit 80dba17

File tree

1 file changed (+15, -1 lines)


sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD2.scala

Lines changed: 15 additions & 1 deletion
@@ -38,6 +38,12 @@ private[sql] object JsonRDD2 extends Logging {
     parseJson(json, schema, columnNameOfCorruptRecords)
   }
 
+  /**
+   * Infer the type of a collection of json records in three stages:
+   *   1. Infer the type of each record
+   *   2. Merge types by choosing the lowest type necessary to cover equal keys
+   *   3. Replace any remaining null fields with string, the top type
+   */
   def inferSchema(
       json: RDD[String],
       samplingRatio: Double = 1.0,
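
The doc comment added above outlines a three-stage inference. As a rough illustration of how those stages fit together, here is a minimal, self-contained sketch over a toy type lattice and plain Scala maps. InferenceSketch, ToyType, Record, compatibleType, and mergeSchemas are hypothetical names for this example only, not the actual JsonRDD2 API, which works on Spark DataTypes and a Jackson streaming parser.

object InferenceSketch {
  sealed trait ToyType
  case object NullType extends ToyType
  case object IntegerType extends ToyType
  case object StringType extends ToyType

  // A "record" here is just a parsed JSON object: field name -> value.
  type Record = Map[String, Any]

  // Stage 1: infer a type for every field of a single record.
  // Empty strings are treated like nulls, mirroring the comment in the hunk below.
  def inferRecord(record: Record): Map[String, ToyType] =
    record.map {
      case (k, null)                   => k -> NullType
      case (k, s: String) if s.isEmpty => k -> NullType
      case (k, _: Int)                 => k -> IntegerType
      case (k, _)                      => k -> StringType
    }

  // Stage 2: merge two inferred types by picking the lowest type covering both.
  def compatibleType(a: ToyType, b: ToyType): ToyType = (a, b) match {
    case (NullType, t)    => t
    case (t, NullType)    => t
    case (x, y) if x == y => x
    case _                => StringType // fall back to the top type
  }

  def mergeSchemas(a: Map[String, ToyType], b: Map[String, ToyType]): Map[String, ToyType] =
    (a.keySet ++ b.keySet).map { k =>
      k -> compatibleType(a.getOrElse(k, NullType), b.getOrElse(k, NullType))
    }.toMap

  // Stage 3: any field that is still NullType after merging becomes StringType.
  def nullsToString(schema: Map[String, ToyType]): Map[String, ToyType] =
    schema.map {
      case (k, NullType) => k -> StringType
      case other         => other
    }

  // Assumes a non-empty collection of records.
  def inferSchema(records: Seq[Record]): Map[String, ToyType] =
    nullsToString(records.map(inferRecord).reduce(mergeSchemas))
}
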
@@ -79,7 +85,15 @@ private[sql] object JsonRDD2 extends Logging {
         parser.nextToken()
         inferField(parser)
 
-      case VALUE_STRING if parser.getTextLength < 1 => NullType
+      case VALUE_STRING if parser.getTextLength < 1 =>
+        // Zero length strings and nulls have special handling to deal
+        // with JSON generators that do not distinguish between the two.
+        // To accurately infer types for empty strings that are really
+        // meant to represent nulls we assume that the two are isomorphic
+        // but will defer treating null fields as strings until all the
+        // record fields' types have been combined.
+        NullType
+
       case VALUE_STRING => StringType
       case START_OBJECT =>
         val builder = Seq.newBuilder[StructField]
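
As a rough illustration of the behaviour the new comment describes, the toy sketch above can be exercised like this (REPL-style, with made-up record contents): a field that is an empty string in one record but an integer in another contributes NullType in stage 1, is merged up to IntegerType in stage 2, while a field that is empty in every record stays NullType through the merge and is only promoted to StringType, the top type, by the final pass.

import InferenceSketch._

val records: Seq[Record] = Seq(
  Map("tag" -> "", "note" -> ""),
  Map("tag" -> 7,  "note" -> "")
)

val schema = inferSchema(records)
// schema == Map("tag" -> IntegerType, "note" -> StringType)
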
