Commit c41e7bc

More cleanup
1 parent 2f0b1ad commit c41e7bc

File tree

9 files changed: +128 -119 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala

Lines changed: 9 additions & 10 deletions

@@ -50,7 +50,6 @@ object SessionCatalog {
 class SessionCatalog(
     externalCatalog: ExternalCatalog,
     globalTempViewManager: GlobalTempViewManager,
-    functionResourceLoader: FunctionResourceLoader,
     functionRegistry: FunctionRegistry,
     conf: CatalystConf,
     hadoopConf: Configuration,
@@ -66,11 +65,11 @@ class SessionCatalog(
     this(
       externalCatalog,
       new GlobalTempViewManager("global_temp"),
-      DummyFunctionResourceLoader,
       functionRegistry,
       conf,
       new Configuration(),
       CatalystSqlParser)
+    functionResourceLoader = DummyFunctionResourceLoader
   }
 
   // For testing only.
@@ -92,6 +91,8 @@ class SessionCatalog(
   @GuardedBy("this")
   protected var currentDb = formatDatabaseName(DEFAULT_DATABASE)
 
+  @volatile var functionResourceLoader: FunctionResourceLoader = _
+
   /**
    * Checks if the given name conforms the Hive standard ("[a-zA-z_0-9]+"),
    * i.e. if this name only contains characters, numbers, and _.
@@ -990,6 +991,9 @@ class SessionCatalog(
    * by a tuple (resource type, resource uri).
    */
   def loadFunctionResources(resources: Seq[FunctionResource]): Unit = {
+    if (functionResourceLoader == null) {
+      throw new IllegalStateException("functionResourceLoader has not yet been initialized")
+    }
     resources.foreach(functionResourceLoader.loadResource)
   }
 
@@ -1186,22 +1190,17 @@ class SessionCatalog(
   }
 
   /**
-   * Get an identical copy of the `SessionCatalog`.
-   * The temporary views and function registry are retained.
-   * The table relation cache will not be populated.
-   * @note `externalCatalog` and `globalTempViewManager` are from shared state, do not need deep
-   * copy. `FunctionResourceLoader` is effectively stateless, also does not need deep copy.
-   * All arguments passed in should be associated with a particular `SparkSession`.
+   * Create a new [[SessionCatalog]] with the provided parameters. `externalCatalog` and
+   * `globalTempViewManager` are inherited, while `currentDb` and `tempTables` are copied.
    */
-  def clone(
+  def newSessionCatalogWith(
       conf: CatalystConf,
      hadoopConf: Configuration,
       functionRegistry: FunctionRegistry,
       parser: ParserInterface): SessionCatalog = {
     val catalog = new SessionCatalog(
       externalCatalog,
       globalTempViewManager,
-      functionResourceLoader,
       functionRegistry,
       conf,
       hadoopConf,
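The pattern worth noting in this diff: `functionResourceLoader` moves from a constructor parameter to a `@volatile var` injected after construction, with a null guard at the one use site. A minimal, self-contained sketch of that late-initialization pattern (all names below are hypothetical, only the shape mirrors the diff):

// Sketch: late initialization of a collaborator through a @volatile var.
trait ResourceLoader {
  def load(uri: String): Unit
}

class Catalog {
  // Written once by the owning session, read from query threads; @volatile
  // makes the write visible across threads without a lock.
  @volatile var resourceLoader: ResourceLoader = _

  def loadResources(uris: Seq[String]): Unit = {
    if (resourceLoader == null) {
      throw new IllegalStateException("resourceLoader has not yet been initialized")
    }
    uris.foreach(resourceLoader.load)
  }
}

object LateInitDemo extends App {
  val catalog = new Catalog
  catalog.resourceLoader = (uri: String) => println(s"loading $uri") // SAM syntax, Scala 2.12+
  catalog.loadResources(Seq("file:/tmp/udf.jar"))
}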

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala

Lines changed: 2 additions & 2 deletions

@@ -1206,7 +1206,7 @@ class SessionCatalogSuite extends PlanTest {
     original.createTempView("copytest1", tempTable1, overrideIfExists = false)
 
     // check if tables copied over
-    val clone = original.clone(
+    val clone = original.newSessionCatalogWith(
       SimpleCatalystConf(caseSensitiveAnalysis = true),
       new Configuration(),
       new SimpleFunctionRegistry,
@@ -1236,7 +1236,7 @@ class SessionCatalogSuite extends PlanTest {
     original.setCurrentDatabase(db1)
 
     // check if current db copied over
-    val clone = original.clone(
+    val clone = original.newSessionCatalogWith(
       SimpleCatalystConf(caseSensitiveAnalysis = true),
       new Configuration(),
       new SimpleFunctionRegistry,
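The renamed tests still pin down the copy semantics: shared components come over by reference, per-session state by value. A compact sketch of the distinction being asserted (hypothetical types, not the real suite):

// Sketch: the two kinds of state a catalog copy distinguishes.
class Cat(val shared: AnyRef, var currentDb: String) {
  // `shared` passes through by reference; `currentDb` is copied by value.
  def newCatalogWith(): Cat = new Cat(shared, currentDb)
}

object CopySemanticsDemo extends App {
  val original = new Cat(new Object, "db1")
  val copy = original.newCatalogWith()
  assert(copy.shared eq original.shared) // inherited: same reference
  copy.currentDb = "db2"
  assert(original.currentDb == "db1")    // copied: later changes stay independent
}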

sql/core/src/main/scala/org/apache/spark/sql/ExperimentalMethods.scala

Lines changed: 2 additions & 4 deletions

@@ -48,10 +48,8 @@ class ExperimentalMethods private[sql]() {
 
   override def clone(): ExperimentalMethods = {
     val result = new ExperimentalMethods
-    synchronized {
-      result.extraStrategies = extraStrategies
-      result.extraOptimizations = extraOptimizations
-    }
+    result.extraStrategies = extraStrategies
+    result.extraOptimizations = extraOptimizations
     result
   }
 }
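Dropping the `synchronized` block is sound assuming, as in Spark's `ExperimentalMethods`, that the two fields are independent `@volatile var`s: each assignment is one volatile read followed by a plain write, so the lock bought no atomicity worth having. A sketch under that assumption:

// Sketch: lock-free clone over independent @volatile fields.
class Methods {
  @volatile var extraStrategies: Seq[String] = Nil
  @volatile var extraOptimizations: Seq[String] = Nil

  override def clone(): Methods = {
    val result = new Methods
    result.extraStrategies = extraStrategies       // one volatile read each is
    result.extraOptimizations = extraOptimizations // enough for visibility
    result
  }
}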

sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala

Lines changed: 9 additions & 15 deletions

@@ -80,10 +80,6 @@ class SparkSession private(
     @transient private val parentSessionState: Option[SessionState])
   extends Serializable with Closeable with Logging { self =>
 
-  private[sql] def this(sc: SparkContext, existingSharedState: Option[SharedState]) {
-    this(sc, existingSharedState, None)
-  }
-
   private[sql] def this(sc: SparkContext) {
     this(sc, None, None)
   }
@@ -129,9 +125,11 @@ class SparkSession private(
   lazy val sessionState: SessionState = {
     parentSessionState
       .map(_.clone(this))
-      .getOrElse(SparkSession.instantiateSessionState(
-        SparkSession.sessionStateClassName(sparkContext.conf),
-        self))
+      .getOrElse {
+        SparkSession.instantiateSessionState(
+          SparkSession.sessionStateClassName(sparkContext.conf),
+          self)
+      }
   }
 
   /**
@@ -221,13 +219,12 @@ class SparkSession private(
    * @since 2.0.0
    */
   def newSession(): SparkSession = {
-    new SparkSession(sparkContext, Some(sharedState))
+    new SparkSession(sparkContext, Some(sharedState), parentSessionState = None)
   }
 
   /**
-   * :: Experimental ::
    * Create an identical copy of this `SparkSession`, sharing the underlying `SparkContext`
-   * and cached data. All the state of this session (i.e. SQL configurations, temporary tables,
+   * and shared state. All the state of this session (i.e. SQL configurations, temporary tables,
    * registered functions) is copied over, and the cloned session is set up with the same shared
    * state as this session. The cloned session is independent of this session, that is, any
    * non-global change in either session is not reflected in the other.
@@ -236,12 +233,8 @@ class SparkSession private(
    * This method will force the initialization of the shared state to ensure that parent
    * and child sessions are set up with the same shared state. If the underlying catalog
    * implementation is Hive, this will initialize the metastore, which may take some time.
-   *
-   * @since 2.2.0
    */
-  @Experimental
-  @InterfaceStability.Evolving
-  def cloneSession(): SparkSession = {
+  private[sql] def cloneSession(): SparkSession = {
     val result = new SparkSession(sparkContext, Some(sharedState), Some(sessionState))
     result.sessionState // force copy of SessionState
     result
@@ -919,6 +912,7 @@ object SparkSession {
         }
       })
     }
+
     return session
   }
 }
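The reshaped `sessionState` initializer is the heart of session cloning: a child session clones its parent's state when one was captured at construction, and otherwise builds fresh state on first use. A stripped-down sketch of that parent/child pattern (hypothetical types):

// Sketch: lazily built session state that prefers cloning a captured parent.
class State(val conf: Map[String, String]) {
  def cloneState(): State = new State(conf)
}

class Session(parent: Option[State]) {
  lazy val state: State =
    parent.map(_.cloneState()).getOrElse(new State(Map.empty))

  def cloneSession(): Session = {
    val child = new Session(Some(state))
    child.state // force the copy now, as the real cloneSession() does
    child
  }
}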

sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala

Lines changed: 72 additions & 45 deletions

@@ -17,7 +17,10 @@
 
 package org.apache.spark.sql.internal
 
+import java.io.File
+
 import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.sql._
@@ -35,6 +38,10 @@ import org.apache.spark.sql.util.ExecutionListenerManager
 
 /**
  * A class that holds all session-specific state in a given [[SparkSession]].
+ * @param sparkContext The [[SparkContext]].
+ * @param sharedState The shared state.
+ * @param conf SQL-specific key-value configurations.
+ * @param experimentalMethods The experimental methods.
  * @param functionRegistry Internal catalog for managing functions registered by the user.
  * @param catalog Internal catalog for managing table and database states.
  * @param sqlParser Parser that extracts expressions, plans, table identifiers etc. from SQL texts.
@@ -55,42 +62,60 @@ private[sql] class SessionState(
     val streamingQueryManager: StreamingQueryManager,
     val queryExecutionCreator: LogicalPlan => QueryExecution) {
 
+  def newHadoopConf(): Configuration = SessionState.newHadoopConf(
+    sparkContext.hadoopConfiguration,
+    conf)
+
+  def newHadoopConfWithOptions(options: Map[String, String]): Configuration = {
+    val hadoopConf = newHadoopConf()
+    options.foreach { case (k, v) =>
+      if ((v ne null) && k != "path" && k != "paths") {
+        hadoopConf.set(k, v)
+      }
+    }
+    hadoopConf
+  }
+
+  /**
+   * A class for loading resources specified by a function.
+   */
+  val functionResourceLoader: FunctionResourceLoader = {
+    new FunctionResourceLoader {
+      override def loadResource(resource: FunctionResource): Unit = {
+        resource.resourceType match {
+          case JarResource => addJar(resource.uri)
+          case FileResource => sparkContext.addFile(resource.uri)
+          case ArchiveResource =>
+            throw new AnalysisException(
+              "Archive is not allowed to be loaded. If YARN mode is used, " +
+                "please use --archives options while calling spark-submit.")
+        }
+      }
+    }
+  }
+
   /**
    * Interface exposed to the user for registering user-defined functions.
    * Note that the user-defined functions must be deterministic.
    */
   val udf: UDFRegistration = new UDFRegistration(functionRegistry)
 
   /**
-   *  Logical query plan optimizer.
+   * Logical query plan optimizer.
    */
   val optimizer: Optimizer = new SparkOptimizer(catalog, conf, experimentalMethods)
 
-  /**
-   * An interface to register custom [[org.apache.spark.sql.util.QueryExecutionListener]]s
-   * that listen for execution metrics.
-   */
-  val listenerManager: ExecutionListenerManager = new ExecutionListenerManager
-
   /**
    * Planner that converts optimized logical plans to physical plans.
    */
   def planner: SparkPlanner =
     new SparkPlanner(sparkContext, conf, experimentalMethods.extraStrategies)
 
-  def newHadoopConf(): Configuration = SessionState.newHadoopConf(
-    sparkContext.hadoopConfiguration,
-    conf)
-
-  def newHadoopConfWithOptions(options: Map[String, String]): Configuration = {
-    val hadoopConf = newHadoopConf()
-    options.foreach { case (k, v) =>
-      if ((v ne null) && k != "path" && k != "paths") {
-        hadoopConf.set(k, v)
-      }
-    }
-    hadoopConf
-  }
+  /**
+   * An interface to register custom [[org.apache.spark.sql.util.QueryExecutionListener]]s
+   * that listen for execution metrics.
+   */
+  val listenerManager: ExecutionListenerManager = new ExecutionListenerManager
 
   /**
    * Get an identical copy of the `SessionState` and associate it with the given `SparkSession`
@@ -100,7 +125,7 @@ private[sql] class SessionState(
     val confCopy = conf.clone()
     val functionRegistryCopy = functionRegistry.clone()
     val sqlParser: ParserInterface = new SparkSqlParser(confCopy)
-    val catalogCopy = catalog.clone(
+    val catalogCopy = catalog.newSessionCatalogWith(
       confCopy,
       SessionState.newHadoopConf(sparkContext.hadoopConfiguration, confCopy),
       functionRegistryCopy,
@@ -132,7 +157,26 @@
     catalog.refreshTable(sqlParser.parseTableIdentifier(tableName))
   }
 
-  def addJar(path: String): Unit = sharedState.addJar(path)
+  /**
+   * Add a jar path to [[SparkContext]] and the classloader.
+   *
+   * Note: this method does not seem to access any session state, but the subclass
+   * `HiveSessionState` needs to add the jar to its Hive client for the current session,
+   * so it still needs to live in [[SessionState]].
+   */
+  def addJar(path: String): Unit = {
+    sparkContext.addJar(path)
+    val uri = new Path(path).toUri
+    val jarURL = if (uri.getScheme == null) {
+      // `path` is a local file path without a URL scheme
+      new File(path).toURI.toURL
+    } else {
+      // `path` is a URL with a scheme
+      uri.toURL
+    }
+    sharedState.jarClassLoader.addURL(jarURL)
+    Thread.currentThread().setContextClassLoader(sharedState.jarClassLoader)
+  }
 }
 
 
@@ -150,16 +194,11 @@ object SessionState {
 
     val functionRegistry = FunctionRegistry.builtin.clone()
 
-    // A class for loading resources specified by a function.
-    val functionResourceLoader: FunctionResourceLoader =
-      createFunctionResourceLoader(sparkContext, sparkSession.sharedState)
-
     val sqlParser: ParserInterface = new SparkSqlParser(sqlConf)
 
     val catalog = new SessionCatalog(
       sparkSession.sharedState.externalCatalog,
       sparkSession.sharedState.globalTempViewManager,
-      functionResourceLoader,
       functionRegistry,
       sqlConf,
       newHadoopConf(sparkContext.hadoopConfiguration, sqlConf),
@@ -171,7 +210,7 @@
 
     val queryExecutionCreator = (plan: LogicalPlan) => new QueryExecution(sparkSession, plan)
 
-    new SessionState(
+    val sessionState = new SessionState(
       sparkContext,
       sparkSession.sharedState,
       sqlConf,
@@ -182,23 +221,11 @@
       analyzer,
       streamingQueryManager,
       queryExecutionCreator)
-  }
-
-  def createFunctionResourceLoader(
-      sparkContext: SparkContext,
-      sharedState: SharedState): FunctionResourceLoader = {
-    new FunctionResourceLoader {
-      override def loadResource(resource: FunctionResource): Unit = {
-        resource.resourceType match {
-          case JarResource => sharedState.addJar(resource.uri)
-          case FileResource => sparkContext.addFile(resource.uri)
-          case ArchiveResource =>
-            throw new AnalysisException(
-              "Archive is not allowed to be loaded. If YARN mode is used, " +
-                "please use --archives options while calling spark-submit.")
-        }
-      }
-    }
+    // functionResourceLoader needs to access SessionState.addJar, so it cannot be created
+    // before SessionState itself. Setting `catalog.functionResourceLoader` here is safe since
+    // the caller cannot use the SessionCatalog before this method returns the SessionState.
+    catalog.functionResourceLoader = sessionState.functionResourceLoader
+    sessionState
   }
 
   def newHadoopConf(hadoopConf: Configuration, sqlConf: SQLConf): Configuration = {
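The comment added at the end of `apply` describes a classic construction cycle: the catalog needs a resource loader, the loader needs `SessionState.addJar`, and `SessionState` needs the catalog. The commit breaks it with two-phase wiring. A minimal sketch (hypothetical names) of that wiring order:

// Sketch: two-phase wiring that breaks the construction cycle.
trait Loader { def load(uri: String): Unit }

class FnCatalog {
  @volatile var loader: Loader = _ // wired up after construction
}

class StateSketch(val catalog: FnCatalog) {
  def addJar(path: String): Unit = println(s"adding jar $path")
  val loader: Loader = (uri: String) => addJar(uri) // needs addJar, so it lives here
}

object WiringDemo extends App {
  val catalog = new FnCatalog          // phase 1: loader not yet available
  val state = new StateSketch(catalog) // phase 2: the state can now provide one
  catalog.loader = state.loader        // patched in before any caller sees the catalog
  catalog.loader.load("file:/tmp/udf.jar")
}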

sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala

Lines changed: 0 additions & 16 deletions

@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.internal
 
-import java.io.File
-
 import scala.reflect.ClassTag
 import scala.util.control.NonFatal
 
@@ -146,20 +144,6 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging {
     }
     SparkSession.sqlListener.get()
   }
-
-  def addJar(path: String): Unit = {
-    sparkContext.addJar(path)
-    val uri = new Path(path).toUri
-    val jarURL = if (uri.getScheme == null) {
-      // `path` is a local file path without a URL scheme
-      new File(path).toURI.toURL
-    } else {
-      // `path` is a URL with a scheme
-      uri.toURL
-    }
-    jarClassLoader.addURL(jarURL)
-    Thread.currentThread().setContextClassLoader(jarClassLoader)
-  }
 }
 
 object SharedState {
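With `addJar` now living in `SessionState`, its path handling is the only subtle part: a bare local path is given a `file:` scheme via `java.io.File`, while anything already carrying a scheme is converted directly. A standalone sketch of that branch, with `java.net.URI` standing in for Hadoop's `Path.toUri`:

// Sketch: normalizing a jar path to a URL, one branch per input shape.
import java.io.File
import java.net.{URI, URL}

object JarUrlSketch {
  def toJarUrl(path: String): URL = {
    val uri = new URI(path) // java.net.URI stands in for Hadoop's Path.toUri here
    if (uri.getScheme == null) {
      new File(path).toURI.toURL // bare local path: give it a file: scheme
    } else {
      uri.toURL // already a URL with a scheme: convert directly
    }
  }
}

// JarUrlSketch.toJarUrl("/tmp/udf.jar")        returns file:/tmp/udf.jar
// JarUrlSketch.toJarUrl("file:///tmp/udf.jar") returns file:///tmp/udf.jar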
