
Commit 74e09d6

Merge branch 'SPARK-20865' into 'spark_2.1'
[SPARK-20865] Structured streaming DataFrame cache/unpersist throws an error. Calling cache on a structured streaming Dataset currently fails; operations such as cache and unpersist should instead be ignored, with a warning logged. resolve apache#74 See merge request !65
2 parents 782a22e + 5938d76 commit 74e09d6
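The merge makes cache, persist, and unpersist (and checkpoint) warn-and-skip instead of failing on streaming Datasets. A minimal sketch of the affected call pattern, assuming a local Spark 2.1 session and a hypothetical socket source on localhost:9999 (these names are illustrative, not part of the commit):

import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

object StreamingCacheNoOp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("streaming-cache-noop")
      .getOrCreate()

    // isStreaming is true for any Dataset built from readStream.
    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", "9999")
      .load()

    // Per the commit message these calls previously failed on a streaming Dataset;
    // with this merge they only log, e.g. "Persist is no-op in queries with streaming sources."
    lines.persist(StorageLevel.MEMORY_ONLY)
    lines.cache()
    lines.unpersist(blocking = false)

    spark.stop()
  }
}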


2 files changed (+55, -30 lines)


sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 45 additions & 27 deletions
@@ -31,6 +31,7 @@ import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.java.function._
 import org.apache.spark.api.python.{PythonRDD, SerDeUtil}
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
@@ -156,7 +157,7 @@ class Dataset[T] private[sql](
     @transient val sparkSession: SparkSession,
     @DeveloperApi @InterfaceStability.Unstable @transient val queryExecution: QueryExecution,
     encoder: Encoder[T])
-  extends Serializable {
+  extends Serializable with Logging {
 
   queryExecution.assertAnalyzed()
 
@@ -503,35 +504,40 @@ class Dataset[T] private[sql](
   @Experimental
   @InterfaceStability.Evolving
   def checkpoint(eager: Boolean): Dataset[T] = {
-    val internalRdd = queryExecution.toRdd.map(_.copy())
-    internalRdd.checkpoint()
+    if (isStreaming) {
+      logWarning("Checkpoint is no-op in queries with streaming sources.")
+      this
+    } else {
+      val internalRdd = queryExecution.toRdd.map(_.copy())
+      internalRdd.checkpoint()
 
-    if (eager) {
-      internalRdd.count()
-    }
+      if (eager) {
+        internalRdd.count()
+      }
 
-    val physicalPlan = queryExecution.executedPlan
+      val physicalPlan = queryExecution.executedPlan
 
-    // Takes the first leaf partitioning whenever we see a `PartitioningCollection`. Otherwise the
-    // size of `PartitioningCollection` may grow exponentially for queries involving deep inner
-    // joins.
-    def firstLeafPartitioning(partitioning: Partitioning): Partitioning = {
-      partitioning match {
-        case p: PartitioningCollection => firstLeafPartitioning(p.partitionings.head)
-        case p => p
+      // Takes the first leaf partitioning whenever we see a `PartitioningCollection`. Otherwise the
+      // size of `PartitioningCollection` may grow exponentially for queries involving deep inner
+      // joins.
+      def firstLeafPartitioning(partitioning: Partitioning): Partitioning = {
+        partitioning match {
+          case p: PartitioningCollection => firstLeafPartitioning(p.partitionings.head)
+          case p => p
+        }
       }
-    }
 
-    val outputPartitioning = firstLeafPartitioning(physicalPlan.outputPartitioning)
+      val outputPartitioning = firstLeafPartitioning(physicalPlan.outputPartitioning)
 
-    Dataset.ofRows(
-      sparkSession,
-      LogicalRDD(
-        logicalPlan.output,
-        internalRdd,
-        outputPartitioning,
-        physicalPlan.outputOrdering
-      )(sparkSession)).as[T]
+      Dataset.ofRows(
+        sparkSession,
+        LogicalRDD(
+          logicalPlan.output,
+          internalRdd,
+          outputPartitioning,
+          physicalPlan.outputOrdering
+        )(sparkSession)).as[T]
+    }
   }
 
   /**
@@ -2493,7 +2499,11 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def persist(): this.type = {
-    sparkSession.sharedState.cacheManager.cacheQuery(this)
+    if (isStreaming) {
+      logWarning("Persist is no-op in queries with streaming sources.")
+    } else {
+      sparkSession.sharedState.cacheManager.cacheQuery(this)
+    }
     this
   }
 
@@ -2515,7 +2525,11 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def persist(newLevel: StorageLevel): this.type = {
-    sparkSession.sharedState.cacheManager.cacheQuery(this, None, newLevel)
+    if (isStreaming) {
+      logWarning("Persist is no-op in queries with streaming sources.")
+    } else {
+      sparkSession.sharedState.cacheManager.cacheQuery(this, None, newLevel)
+    }
     this
   }
 
@@ -2540,7 +2554,11 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def unpersist(blocking: Boolean): this.type = {
-    sparkSession.sharedState.cacheManager.uncacheQuery(this, blocking)
+    if (isStreaming) {
+      logWarning("Unpersist is no-op in queries with streaming sources.")
+    } else {
+      sparkSession.sharedState.cacheManager.uncacheQuery(this, blocking)
+    }
     this
   }
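With the guard above, checkpoint on a streaming Dataset only logs a warning and returns this; the batch path is unchanged. A minimal sketch of the surviving batch behaviour, assuming a local session and a hypothetical checkpoint directory /tmp/spark-checkpoints:

import org.apache.spark.sql.SparkSession

object EagerCheckpointSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("eager-checkpoint-sketch")
      .getOrCreate()
    import spark.implicits._

    // Reliable checkpointing requires a checkpoint directory on the SparkContext.
    spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

    val ds = spark.range(0, 1000).as[Long]

    // eager = true checkpoints and counts the underlying RDD immediately; the result
    // is a new Dataset backed by the checkpointed LogicalRDD, cutting the lineage.
    val checkpointed = ds.checkpoint(eager = true)
    println(checkpointed.count())

    spark.stop()
  }
}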

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala

Lines changed: 10 additions & 3 deletions
@@ -18,6 +18,8 @@
 package org.apache.spark.sql.execution.streaming.state
 
 import java.io.{DataInputStream, DataOutputStream, FileNotFoundException, IOException}
+import java.nio.channels.ClosedChannelException
+import java.util.Locale
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -73,7 +75,12 @@ private[state] class HDFSBackedStateStoreProvider(
     hadoopConf: Configuration
   ) extends StateStoreProvider with Logging {
 
-  type MapType = java.util.HashMap[UnsafeRow, UnsafeRow]
+  // ConcurrentHashMap is used because it generates fail-safe iterators on filtering
+  // - The iterator is weakly consistent with the map, i.e., iterator's data reflect the values in
+  //   the map when the iterator was created
+  // - Any updates to the map while iterating through the filtered iterator does not throw
+  //   java.util.ConcurrentModificationException
+  type MapType = java.util.concurrent.ConcurrentHashMap[UnsafeRow, UnsafeRow]
 
   /** Implementation of [[StateStore]] API which is backed by a HDFS-compatible file system */
   class HDFSBackedStateStore(val version: Long, mapToUpdate: MapType)
@@ -208,7 +215,7 @@ private[state] class HDFSBackedStateStoreProvider(
     }
 
     override def toString(): String = {
-      s"HDFSStateStore[id = (op=${id.operatorId}, part=${id.partitionId}), dir = $baseDir]"
+      s"HDFSStateStore[id=(op=${id.operatorId},part=${id.partitionId}),dir=$baseDir]"
     }
   }
 
@@ -565,7 +572,7 @@ private[state] class HDFSBackedStateStoreProvider(
       val nameParts = path.getName.split("\\.")
      if (nameParts.size == 2) {
        val version = nameParts(0).toLong
-        nameParts(1).toLowerCase match {
+        nameParts(1).toLowerCase(Locale.ROOT) match {
          case "delta" =>
            // ignore the file otherwise, snapshot file already exists for that batch id
            if (!versionToFiles.contains(version)) {
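The MapType change above relies on ConcurrentHashMap's weakly consistent iterators: updating the map while an iterator is open does not throw java.util.ConcurrentModificationException, unlike java.util.HashMap's fail-fast iterators. A small standalone sketch of that contrast (plain String keys stand in for UnsafeRow; this is not code from the commit):

import java.util.ConcurrentModificationException

object WeaklyConsistentIteration {
  def main(args: Array[String]): Unit = {
    // java.util.HashMap: fail-fast iterator; structural modification mid-iteration throws.
    val hashMap = new java.util.HashMap[String, String]()
    hashMap.put("a", "1")
    hashMap.put("b", "2")
    try {
      val it = hashMap.entrySet().iterator()
      while (it.hasNext) {
        it.next()
        hashMap.put("c", "3") // modification while iterating
      }
    } catch {
      case _: ConcurrentModificationException =>
        println("HashMap iterator failed fast")
    }

    // java.util.concurrent.ConcurrentHashMap: weakly consistent iterator; the same
    // update pattern completes without an exception and reflects the map roughly as
    // of iterator creation.
    val chm = new java.util.concurrent.ConcurrentHashMap[String, String]()
    chm.put("a", "1")
    chm.put("b", "2")
    val it2 = chm.entrySet().iterator()
    while (it2.hasNext) {
      it2.next()
      chm.put("c", "3") // no ConcurrentModificationException
    }
    println("ConcurrentHashMap iteration completed without exception")
  }
}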
