
Commit 92bfd9a

gengliangwang authored and cloud-fan committed
[SPARK-28757][SQL] File table location should include both values of option path and paths
### What changes were proposed in this pull request?

If both options `path` and `paths` are passed to file data source v2, both values of the options should be included as the target paths.

### Why are the changes needed?

In the V1 implementation, the file table location includes both values of the options `path` and `paths`. In the refactoring of #24025, the value of option `path` is ignored if `paths` is specified. We should make it consistent with V1.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Unit test

Closes #25473 from gengliangwang/fixPathOption.

Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent c48e381 commit 92bfd9a
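
For context, a minimal usage sketch of the behavior this change restores. The directory names below are hypothetical placeholders; the point is that the location given through the `path` option is kept alongside the varargs locations, matching V1:

```scala
// Hypothetical directories. The varargs to .parquet(...) are passed on as
// the JSON-encoded `paths` option; with this fix the `path` option is
// appended rather than dropped, so the scan covers all three directories.
val df = spark.read
  .option("path", "/data/a")
  .parquet("/data/b", "/data/c")
```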

File tree

2 files changed: +27 −4 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala

Lines changed: 3 additions & 4 deletions

@@ -43,11 +43,10 @@ trait FileDataSourceV2 extends TableProvider with DataSourceRegister {
 
   protected def getPaths(map: CaseInsensitiveStringMap): Seq[String] = {
     val objectMapper = new ObjectMapper()
-    Option(map.get("paths")).map { pathStr =>
+    val paths = Option(map.get("paths")).map { pathStr =>
       objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq
-    }.getOrElse {
-      Option(map.get("path")).toSeq
-    }
+    }.getOrElse(Seq.empty)
+    paths ++ Option(map.get("path")).toSeq
   }
 
   protected def getTableName(paths: Seq[String]): String = {
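
To make the merged lookup concrete, here is a standalone sketch of the post-fix logic (a hypothetical `getPathsSketch` over a plain Scala `Map`, standing in for Spark's `CaseInsensitiveStringMap`; the JSON decoding mirrors the `ObjectMapper` call above):

```scala
import com.fasterxml.jackson.databind.ObjectMapper

// Values from the JSON-encoded "paths" option come first; the single
// "path" option, if present, is appended instead of being discarded.
def getPathsSketch(options: Map[String, String]): Seq[String] = {
  val objectMapper = new ObjectMapper()
  val paths = options.get("paths").map { pathStr =>
    objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq
  }.getOrElse(Seq.empty)
  paths ++ options.get("path").toSeq
}

// getPathsSketch(Map("path" -> "/data/a", "paths" -> """["/data/b","/data/c"]"""))
// == Seq("/data/b", "/data/c", "/data/a")
```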

sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala

Lines changed: 24 additions & 0 deletions

@@ -29,6 +29,9 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.SparkException
 import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
 import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT, NullData, NullUDT}
+import org.apache.spark.sql.catalyst.planning.PhysicalOperation
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -707,6 +710,27 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext with Befo
       }
     }
   }
+
+  test("File table location should include both values of option `path` and `paths`") {
+    withSQLConf(SQLConf.USE_V1_SOURCE_READER_LIST.key -> "") {
+      withTempPaths(3) { paths =>
+        paths.zipWithIndex.foreach { case (path, index) =>
+          Seq(index).toDF("a").write.mode("overwrite").parquet(path.getCanonicalPath)
+        }
+        val df = spark
+          .read
+          .option("path", paths.head.getCanonicalPath)
+          .parquet(paths(1).getCanonicalPath, paths(2).getCanonicalPath)
+        df.queryExecution.optimizedPlan match {
+          case PhysicalOperation(_, _, DataSourceV2Relation(table: ParquetTable, _, _)) =>
+            assert(table.paths.toSet == paths.map(_.getCanonicalPath).toSet)
+          case _ =>
+            throw new AnalysisException("Can not match ParquetTable in the query.")
+        }
+        checkAnswer(df, Seq(0, 1, 2).map(Row(_)))
+      }
+    }
+  }
 }
 
 object TestingUDT {
