
Commit f3ed62a

gatorsmile authored and cloud-fan committed
[SPARK-20831][SQL] Fix INSERT OVERWRITE data source tables with IF NOT EXISTS
### What changes were proposed in this pull request?

Currently, we have a bug when we specify `IF NOT EXISTS` in `INSERT OVERWRITE` data source tables. For example, given a query:
```SQL
INSERT OVERWRITE TABLE $tableName partition (b=2, c=3) IF NOT EXISTS SELECT 9, 10
```
we will get the following error:
```
unresolved operator 'InsertIntoTable Relation[a#425,d#426,b#427,c#428] parquet, Map(b -> Some(2), c -> Some(3)), true, true;;
'InsertIntoTable Relation[a#425,d#426,b#427,c#428] parquet, Map(b -> Some(2), c -> Some(3)), true, true
+- Project [cast(9#423 as int) AS a#429, cast(10#424 as int) AS d#430]
   +- Project [9 AS 9#423, 10 AS 10#424]
      +- OneRowRelation$
```

This PR fixes the issue so that data source tables follow the behavior of Hive serde tables:

> INSERT OVERWRITE will overwrite any existing data in the table or partition unless IF NOT EXISTS is provided for a partition

### How was this patch tested?

Modified an existing test case.

Author: gatorsmile <[email protected]>

Closes #18050 from gatorsmile/insertPartitionIfNotExists.
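For context, here is a minimal spark-shell sketch of the end-to-end behavior this commit enables; the table name and schema are hypothetical, modeled on the query above:

```scala
// Hypothetical partitioned data source table mirroring the PR's example.
spark.sql("CREATE TABLE t (a INT, d INT, b INT, c INT) USING parquet PARTITIONED BY (b, c)")
spark.sql("INSERT OVERWRITE TABLE t PARTITION (b=2, c=3) SELECT 1, 2")

// Before this patch: the analyzer fails with the unresolved-operator error above.
// After: the statement succeeds but leaves the existing partition untouched,
// because IF NOT EXISTS was specified and partition (b=2, c=3) already exists.
spark.sql("INSERT OVERWRITE TABLE t PARTITION (b=2, c=3) IF NOT EXISTS SELECT 9, 10")

spark.sql("SELECT a, d, b, c FROM t").show()  // still the original row (1, 2, 2, 3)
```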
1 parent 2597674 commit f3ed62a

12 files changed, +90 −94 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -366,7 +366,7 @@ package object dsl {
     def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan =
       InsertIntoTable(
         analysis.UnresolvedRelation(TableIdentifier(tableName)),
-        Map.empty, logicalPlan, overwrite, false)
+        Map.empty, logicalPlan, overwrite, ifPartitionNotExists = false)
 
     def as(alias: String): LogicalPlan = SubqueryAlias(alias, logicalPlan)
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala

Lines changed: 7 additions & 4 deletions
```diff
@@ -410,17 +410,20 @@ case class Hint(name: String, parameters: Seq[String], child: LogicalPlan) exten
  * would have Map('a' -> Some('1'), 'b' -> None).
  * @param query the logical plan representing data to write to.
  * @param overwrite overwrite existing table or partitions.
- * @param ifNotExists If true, only write if the table or partition does not exist.
+ * @param ifPartitionNotExists If true, only write if the partition does not exist.
+ *                             Only valid for static partitions.
  */
 case class InsertIntoTable(
     table: LogicalPlan,
     partition: Map[String, Option[String]],
     query: LogicalPlan,
     overwrite: Boolean,
-    ifNotExists: Boolean)
+    ifPartitionNotExists: Boolean)
   extends LogicalPlan {
-  assert(overwrite || !ifNotExists)
-  assert(partition.values.forall(_.nonEmpty) || !ifNotExists)
+  // IF NOT EXISTS is only valid in INSERT OVERWRITE
+  assert(overwrite || !ifPartitionNotExists)
+  // IF NOT EXISTS is only valid in static partitions
+  assert(partition.values.forall(_.nonEmpty) || !ifPartitionNotExists)
 
   // We don't want `table` in children as sometimes we don't want to transform it.
   override def children: Seq[LogicalPlan] = query :: Nil
```
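To make the two new assertions concrete, here is a small standalone Scala sketch (not Spark code) of the validity rule they encode: `IF NOT EXISTS` requires `INSERT OVERWRITE`, and every partition value must be static, i.e. fully specified rather than a dynamic-partition `None`.

```scala
// Standalone sketch of the validity rule the two assertions encode.
def isValidInsert(
    partition: Map[String, Option[String]],
    overwrite: Boolean,
    ifPartitionNotExists: Boolean): Boolean =
  (overwrite || !ifPartitionNotExists) &&                  // only with INSERT OVERWRITE
    (partition.values.forall(_.nonEmpty) || !ifPartitionNotExists)  // only static partitions

assert(isValidInsert(Map("b" -> Some("2")), overwrite = true, ifPartitionNotExists = true))
assert(!isValidInsert(Map("b" -> Some("2")), overwrite = false, ifPartitionNotExists = true)) // plain INSERT INTO
assert(!isValidInsert(Map("b" -> None), overwrite = true, ifPartitionNotExists = true))       // dynamic partition
```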

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -303,7 +303,7 @@ object SQLConf {
   val HIVE_MANAGE_FILESOURCE_PARTITIONS =
     buildConf("spark.sql.hive.manageFilesourcePartitions")
       .doc("When true, enable metastore partition management for file source tables as well. " +
-        "This includes both datasource and converted Hive tables. When partition managment " +
+        "This includes both datasource and converted Hive tables. When partition management " +
         "is enabled, datasource tables store partition in the Hive metastore, and use the " +
         "metastore to prune partitions during query planning.")
       .booleanConf
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala

Lines changed: 5 additions & 5 deletions
```diff
@@ -176,14 +176,14 @@ class PlanParserSuite extends PlanTest {
     def insert(
         partition: Map[String, Option[String]],
         overwrite: Boolean = false,
-        ifNotExists: Boolean = false): LogicalPlan =
-      InsertIntoTable(table("s"), partition, plan, overwrite, ifNotExists)
+        ifPartitionNotExists: Boolean = false): LogicalPlan =
+      InsertIntoTable(table("s"), partition, plan, overwrite, ifPartitionNotExists)
 
     // Single inserts
     assertEqual(s"insert overwrite table s $sql",
       insert(Map.empty, overwrite = true))
     assertEqual(s"insert overwrite table s partition (e = 1) if not exists $sql",
-      insert(Map("e" -> Option("1")), overwrite = true, ifNotExists = true))
+      insert(Map("e" -> Option("1")), overwrite = true, ifPartitionNotExists = true))
     assertEqual(s"insert into s $sql",
       insert(Map.empty))
     assertEqual(s"insert into table s partition (c = 'd', e = 1) $sql",
@@ -193,9 +193,9 @@ class PlanParserSuite extends PlanTest {
     val plan2 = table("t").where('x > 5).select(star())
     assertEqual("from t insert into s select * limit 1 insert into u select * where x > 5",
       InsertIntoTable(
-        table("s"), Map.empty, plan.limit(1), false, ifNotExists = false).union(
+        table("s"), Map.empty, plan.limit(1), false, ifPartitionNotExists = false).union(
         InsertIntoTable(
-          table("u"), Map.empty, plan2, false, ifNotExists = false)))
+          table("u"), Map.empty, plan2, false, ifPartitionNotExists = false)))
   }
 
   test ("insert with if not exists") {
```

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -286,7 +286,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       partition = Map.empty[String, Option[String]],
       query = df.logicalPlan,
       overwrite = mode == SaveMode.Overwrite,
-      ifNotExists = false)
+      ifPartitionNotExists = false)
   }
 }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 1 addition & 0 deletions
```diff
@@ -430,6 +430,7 @@ case class DataSource(
       InsertIntoHadoopFsRelationCommand(
         outputPath = outputPath,
         staticPartitions = Map.empty,
+        ifPartitionNotExists = false,
         partitionColumns = partitionAttributes,
         bucketSpec = bucketSpec,
         fileFormat = format,
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -142,8 +142,8 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast
         parts, query, overwrite, false) if parts.isEmpty =>
       InsertIntoDataSourceCommand(l, query, overwrite)
 
-    case InsertIntoTable(
-        l @ LogicalRelation(t: HadoopFsRelation, _, table), parts, query, overwrite, false) =>
+    case i @ InsertIntoTable(
+        l @ LogicalRelation(t: HadoopFsRelation, _, table), parts, query, overwrite, _) =>
       // If the InsertIntoTable command is for a partitioned HadoopFsRelation and
       // the user has specified static partitions, we add a Project operator on top of the query
       // to include those constant column values in the query result.
@@ -195,6 +195,7 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast
       InsertIntoHadoopFsRelationCommand(
         outputPath,
         staticPartitions,
+        i.ifPartitionNotExists,
         partitionSchema,
         t.bucketSpec,
         t.fileFormat,
```
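The only non-mechanical change here is binding the matched plan as `i @ ...` so the rule can forward `i.ifPartitionNotExists` into the command without destructuring the flag. A generic Scala sketch of that pattern binder, using a hypothetical `Insert` class rather than Spark's:

```scala
// `i @ Pattern(...)` binds the whole matched value so a field can be read after the match.
case class Insert(table: String, ifPartitionNotExists: Boolean)

def describe(cmd: Insert): String = cmd match {
  case i @ Insert(t, _) => s"insert into $t, ifPartitionNotExists=${i.ifPartitionNotExists}"
}

assert(describe(Insert("t", ifPartitionNotExists = true)).endsWith("true"))
```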

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala

Lines changed: 13 additions & 5 deletions
```diff
@@ -37,10 +37,13 @@ import org.apache.spark.sql.execution.command._
  * overwrites: when the spec is empty, all partitions are overwritten.
  * When it covers a prefix of the partition keys, only partitions matching
  * the prefix are overwritten.
+ * @param ifPartitionNotExists If true, only write if the partition does not exist.
+ *                             Only valid for static partitions.
  */
 case class InsertIntoHadoopFsRelationCommand(
     outputPath: Path,
     staticPartitions: TablePartitionSpec,
+    ifPartitionNotExists: Boolean,
     partitionColumns: Seq[Attribute],
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
@@ -61,8 +64,8 @@ case class InsertIntoHadoopFsRelationCommand(
       val duplicateColumns = query.schema.fieldNames.groupBy(identity).collect {
         case (x, ys) if ys.length > 1 => "\"" + x + "\""
       }.mkString(", ")
-      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
-        s"cannot save to file.")
+      throw new AnalysisException(s"Duplicate column(s): $duplicateColumns found, " +
+        "cannot save to file.")
     }
 
     val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options)
@@ -76,11 +79,12 @@ case class InsertIntoHadoopFsRelationCommand(
 
     var initialMatchingPartitions: Seq[TablePartitionSpec] = Nil
     var customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty
+    var matchingPartitions: Seq[CatalogTablePartition] = Seq.empty
 
     // When partitions are tracked by the catalog, compute all custom partition locations that
     // may be relevant to the insertion job.
     if (partitionsTrackedByCatalog) {
-      val matchingPartitions = sparkSession.sessionState.catalog.listPartitions(
+      matchingPartitions = sparkSession.sessionState.catalog.listPartitions(
         catalogTable.get.identifier, Some(staticPartitions))
       initialMatchingPartitions = matchingPartitions.map(_.spec)
       customPartitionLocations = getCustomPartitionLocations(
@@ -101,8 +105,12 @@ case class InsertIntoHadoopFsRelationCommand(
       case (SaveMode.ErrorIfExists, true) =>
         throw new AnalysisException(s"path $qualifiedOutputPath already exists.")
       case (SaveMode.Overwrite, true) =>
-        deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer)
-        true
+        if (ifPartitionNotExists && matchingPartitions.nonEmpty) {
+          false
+        } else {
+          deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer)
+          true
+        }
       case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
         true
       case (SaveMode.Ignore, exists) =>
```
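The new `SaveMode.Overwrite` branch above is the heart of the fix. A simplified standalone sketch of that decision, not the actual Spark method:

```scala
// Skip the write entirely when IF NOT EXISTS was given and the catalog already
// lists partitions matching the static partition spec; otherwise delete the
// matching partitions (or the output path) and proceed with the write.
def doWriteOnOverwrite(ifPartitionNotExists: Boolean, matchingPartitionCount: Int): Boolean =
  if (ifPartitionNotExists && matchingPartitionCount > 0) {
    false // target partition exists: the whole insert becomes a no-op
  } else {
    true  // overwrite: delete existing data, then write
  }

assert(!doWriteOnOverwrite(ifPartitionNotExists = true, matchingPartitionCount = 1))
assert(doWriteOnOverwrite(ifPartitionNotExists = false, matchingPartitionCount = 1))
```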

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 7 additions & 7 deletions
```diff
@@ -160,9 +160,9 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
  */
 object HiveAnalysis extends Rule[LogicalPlan] {
   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
-    case InsertIntoTable(relation: CatalogRelation, partSpec, query, overwrite, ifNotExists)
-        if DDLUtils.isHiveTable(relation.tableMeta) =>
-      InsertIntoHiveTable(relation.tableMeta, partSpec, query, overwrite, ifNotExists)
+    case InsertIntoTable(r: CatalogRelation, partSpec, query, overwrite, ifPartitionNotExists)
+        if DDLUtils.isHiveTable(r.tableMeta) =>
+      InsertIntoHiveTable(r.tableMeta, partSpec, query, overwrite, ifPartitionNotExists)
 
     case CreateTable(tableDesc, mode, None) if DDLUtils.isHiveTable(tableDesc) =>
       CreateTableCommand(tableDesc, ignoreIfExists = mode == SaveMode.Ignore)
@@ -207,11 +207,11 @@ case class RelationConversions(
   override def apply(plan: LogicalPlan): LogicalPlan = {
     plan transformUp {
       // Write path
-      case InsertIntoTable(r: CatalogRelation, partition, query, overwrite, ifNotExists)
+      case InsertIntoTable(r: CatalogRelation, partition, query, overwrite, ifPartitionNotExists)
         // Inserting into partitioned table is not supported in Parquet/Orc data source (yet).
-        if query.resolved && DDLUtils.isHiveTable(r.tableMeta) &&
-          !r.isPartitioned && isConvertible(r) =>
-        InsertIntoTable(convert(r), partition, query, overwrite, ifNotExists)
+          if query.resolved && DDLUtils.isHiveTable(r.tableMeta) &&
+            !r.isPartitioned && isConvertible(r) =>
+        InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists)
 
       // Read path
       case relation: CatalogRelation
```

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -62,7 +62,7 @@ case class CreateHiveTableAsSelectCommand(
         Map(),
         query,
         overwrite = false,
-        ifNotExists = false)).toRdd
+        ifPartitionNotExists = false)).toRdd
     } else {
       // TODO ideally, we should get the output data ready first and then
       // add the relation into catalog, just in case of failure occurs while data
@@ -78,7 +78,7 @@ case class CreateHiveTableAsSelectCommand(
         Map(),
         query,
         overwrite = true,
-        ifNotExists = false)).toRdd
+        ifPartitionNotExists = false)).toRdd
     } catch {
       case NonFatal(e) =>
         // drop the created table.
```
