-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-19540][SQL] Add ability to clone SparkSession wherein cloned session has an identical copy of the SessionState #16826
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
18ce1b8
9beb78d
a343d8a
4210079
6da6bda
579d0b7
2837e73
8c00344
f423f74
b1371d8
2cee190
e2bbfa8
8ac778a
0c732ce
3c995e1
292011a
b027412
295ee41
847b484
9beba84
3d2e4a6
4f70d12
dd2dedd
8a8d47b
ffc2058
16824f9
fd11ee2
437b0bc
300d3a0
3ee271f
c3f052f
0bdc81c
2740c63
0f167db
2f0b1ad
c41e7bc
5eb6733
05abcf8
4c23e7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1180,4 +1180,25 @@ class SessionCatalog( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Get an identical copy of the `SessionCatalog`. | ||
| * The temporary tables and function registry are retained. | ||
|
||
| * The table relation cache will not be populated. | ||
| */ | ||
| override def clone: SessionCatalog = { | ||
| val catalog = new SessionCatalog( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you are copying... shouldn't all the fields be copied, instead of just being reused? |
||
| externalCatalog, | ||
| globalTempViewManager, | ||
| functionResourceLoader, | ||
| functionRegistry, | ||
| conf, | ||
| hadoopConf, | ||
| parser) | ||
|
|
||
| catalog.currentDb = currentDb | ||
|
||
| tempTables.foreach(kv => catalog.tempTables.put(kv._1, kv._2)) // copy over temporary tables | ||
|
|
||
| catalog | ||
| } | ||
|
|
||
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,8 @@ | |
|
|
||
| package org.apache.spark.sql | ||
|
|
||
| import scala.collection.mutable.ListBuffer | ||
|
|
||
| import org.apache.spark.annotation.{Experimental, InterfaceStability} | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
|
|
@@ -46,4 +48,23 @@ class ExperimentalMethods private[sql]() { | |
|
|
||
| @volatile var extraOptimizations: Seq[Rule[LogicalPlan]] = Nil | ||
|
|
||
| /** | ||
| * Get an identical copy of this `ExperimentalMethods` instance. | ||
| * @note This is used when forking a `SparkSession`. | ||
| * `clone` is provided here instead of implementing equivalent functionality | ||
| * in `SparkSession.clone` since it needs to be updated | ||
| * as the class `ExperimentalMethods` is extended or modified. | ||
| */ | ||
| override def clone: ExperimentalMethods = { | ||
| def cloneSeq[T](seq: Seq[T]): Seq[T] = { | ||
| val newSeq = new ListBuffer[T] | ||
| newSeq ++= seq | ||
| newSeq | ||
| } | ||
|
|
||
| val result = new ExperimentalMethods | ||
| result.extraStrategies = cloneSeq(extraStrategies) | ||
|
||
| result.extraOptimizations = cloneSeq(extraOptimizations) | ||
| result | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,7 +44,7 @@ import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState} | |
| import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION | ||
| import org.apache.spark.sql.sources.BaseRelation | ||
| import org.apache.spark.sql.streaming._ | ||
| import org.apache.spark.sql.types.{DataType, LongType, StructType} | ||
| import org.apache.spark.sql.types.{DataType, StructType} | ||
| import org.apache.spark.sql.util.ExecutionListenerManager | ||
| import org.apache.spark.util.Utils | ||
|
|
||
|
|
@@ -72,11 +72,16 @@ import org.apache.spark.util.Utils | |
| @InterfaceStability.Stable | ||
| class SparkSession private( | ||
| @transient val sparkContext: SparkContext, | ||
| @transient private val existingSharedState: Option[SharedState]) | ||
| @transient private val existingSharedState: Option[SharedState], | ||
| existingSessionState: Option[SessionState]) | ||
|
||
| extends Serializable with Closeable with Logging { self => | ||
|
|
||
| private[sql] def this(sc: SparkContext, existingSharedState: Option[SharedState]) { | ||
| this(sc, existingSharedState, None) | ||
| } | ||
|
|
||
| private[sql] def this(sc: SparkContext) { | ||
| this(sc, None) | ||
| this(sc, None, None) | ||
| } | ||
|
|
||
| sparkContext.assertNotStopped() | ||
|
|
@@ -107,9 +112,9 @@ class SparkSession private( | |
| */ | ||
| @transient | ||
| private[sql] lazy val sessionState: SessionState = { | ||
| SparkSession.reflect[SessionState, SparkSession]( | ||
| existingSessionState.getOrElse(SparkSession.reflect[SessionState, SparkSession]( | ||
| SparkSession.sessionStateClassName(sparkContext.conf), | ||
| self) | ||
| self)) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -213,6 +218,24 @@ class SparkSession private( | |
| new SparkSession(sparkContext, Some(sharedState)) | ||
| } | ||
|
|
||
| /** | ||
| * Start a new session, sharing the underlying `SparkContext` and cached data. | ||
|
||
| * If inheritSessionState is enabled, then SQL configurations, temporary tables, | ||
| * registered functions are copied over from parent `SparkSession`. | ||
| * | ||
| * @note Other than the `SparkContext`, all shared state is initialized lazily. | ||
| * This method will force the initialization of the shared state to ensure that parent | ||
| * and child sessions are set up with the same shared state. If the underlying catalog | ||
| * implementation is Hive, this will initialize the metastore, which may take some time. | ||
| */ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: please add
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not remove the boolean flag and just call this cloneSession?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That seems cleaner, fixed. |
||
| def newSession(inheritSessionState: Boolean): SparkSession = { | ||
| if (inheritSessionState) { | ||
| new SparkSession(sparkContext, Some(sharedState), Some(sessionState.clone)) | ||
| } else { | ||
| newSession() | ||
| } | ||
| } | ||
|
|
||
|
|
||
| /* --------------------------------- * | ||
| | Methods for creating DataFrames | | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,16 +38,31 @@ import org.apache.spark.sql.util.ExecutionListenerManager | |
|
|
||
| /** | ||
| * A class that holds all session-specific state in a given [[SparkSession]]. | ||
| * If an `existingSessionState` is supplied, then its members will be copied over. | ||
| */ | ||
| private[sql] class SessionState(sparkSession: SparkSession) { | ||
| private[sql] class SessionState( | ||
| sparkSession: SparkSession, | ||
| existingSessionState: Option[SessionState]) { | ||
|
||
|
|
||
| private[sql] def this(sparkSession: SparkSession) = { | ||
| this(sparkSession, None) | ||
| } | ||
|
|
||
| // Note: These are all lazy vals because they depend on each other (e.g. conf) and we | ||
| // want subclasses to override some of the fields. Otherwise, we would get a lot of NPEs. | ||
|
|
||
| /** | ||
| * SQL-specific key-value configurations. | ||
| */ | ||
| lazy val conf: SQLConf = new SQLConf | ||
| lazy val conf: SQLConf = { | ||
| val result = new SQLConf | ||
| if (existingSessionState.nonEmpty) { | ||
|
||
| existingSessionState.get.conf.getAllConfs.foreach { | ||
| case (k, v) => if (v ne null) result.setConfString(k, v) | ||
| } | ||
| } | ||
| result | ||
| } | ||
|
|
||
| def newHadoopConf(): Configuration = { | ||
| val hadoopConf = new Configuration(sparkSession.sparkContext.hadoopConfiguration) | ||
|
|
@@ -65,12 +80,29 @@ private[sql] class SessionState(sparkSession: SparkSession) { | |
| hadoopConf | ||
| } | ||
|
|
||
| lazy val experimentalMethods = new ExperimentalMethods | ||
| lazy val experimentalMethods: ExperimentalMethods = { | ||
| existingSessionState | ||
| .map(_.experimentalMethods.clone) | ||
| .getOrElse(new ExperimentalMethods) | ||
| } | ||
|
|
||
| /** | ||
| * Internal catalog for managing functions registered by the user. | ||
| */ | ||
| lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin.copy() | ||
| lazy val functionRegistry: FunctionRegistry = { | ||
|
||
| val registry = FunctionRegistry.builtin.copy() | ||
|
|
||
| if (existingSessionState.nonEmpty) { | ||
| val sourceRegistry = existingSessionState.get.functionRegistry | ||
| sourceRegistry | ||
| .listFunction() | ||
| .foreach(name => registry.registerFunction( | ||
| name, | ||
| sourceRegistry.lookupFunction(name).get, | ||
| sourceRegistry.lookupFunctionBuilder(name).get)) | ||
| } | ||
| registry | ||
| } | ||
|
|
||
| /** | ||
| * A class for loading resources specified by a function. | ||
|
|
@@ -93,14 +125,18 @@ private[sql] class SessionState(sparkSession: SparkSession) { | |
| /** | ||
| * Internal catalog for managing table and database states. | ||
| */ | ||
| lazy val catalog = new SessionCatalog( | ||
| sparkSession.sharedState.externalCatalog, | ||
| sparkSession.sharedState.globalTempViewManager, | ||
| functionResourceLoader, | ||
| functionRegistry, | ||
| conf, | ||
| newHadoopConf(), | ||
| sqlParser) | ||
| lazy val catalog: SessionCatalog = { | ||
| existingSessionState | ||
| .map(_.catalog.clone) | ||
| .getOrElse(new SessionCatalog( | ||
| sparkSession.sharedState.externalCatalog, | ||
| sparkSession.sharedState.globalTempViewManager, | ||
| functionResourceLoader, | ||
| functionRegistry, | ||
| conf, | ||
| newHadoopConf(), | ||
| sqlParser)) | ||
| } | ||
|
|
||
|
||
| /** | ||
| * Interface exposed to the user for registering user-defined functions. | ||
|
|
@@ -165,6 +201,13 @@ private[sql] class SessionState(sparkSession: SparkSession) { | |
| conf.setConfString(k, v) | ||
| } | ||
|
|
||
| /** | ||
| * Get an identical copy of the `SessionState`. | ||
| */ | ||
| override def clone: SessionState = { | ||
| new SessionState(sparkSession, Some(this)) | ||
| } | ||
|
|
||
| // ------------------------------------------------------ | ||
| // Helper methods, partially leftover from pre-2.0 days | ||
| // ------------------------------------------------------ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,8 @@ | |
| package org.apache.spark.sql | ||
|
|
||
| import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
|
|
||
| /** | ||
| * Test cases for the builder pattern of [[SparkSession]]. | ||
|
|
@@ -123,4 +125,70 @@ class SparkSessionBuilderSuite extends SparkFunSuite { | |
| session.stop() | ||
| } | ||
| } | ||
|
|
||
| test("fork new session and inherit a copy of the session state") { | ||
|
||
| val activeSession = SparkSession.builder().master("local").getOrCreate() | ||
| val forkedSession = activeSession.newSession(inheritSessionState = true) | ||
|
|
||
| assert(forkedSession ne activeSession) | ||
| assert(forkedSession.sessionState ne activeSession.sessionState) | ||
|
|
||
| forkedSession.stop() | ||
| activeSession.stop() | ||
| } | ||
|
|
||
| test("fork new session and inherit sql config options") { | ||
|
||
| val activeSession = SparkSession | ||
| .builder() | ||
| .master("local") | ||
| .config("spark-configb", "b") | ||
|
||
| .getOrCreate() | ||
| val forkedSession = activeSession.newSession(inheritSessionState = true) | ||
|
|
||
| assert(forkedSession ne activeSession) | ||
| assert(forkedSession.conf ne activeSession.conf) | ||
| assert(forkedSession.conf.get("spark-configb") == "b") | ||
|
|
||
| forkedSession.stop() | ||
| activeSession.stop() | ||
| } | ||
|
|
||
| test("fork new session and inherit function registry and udf") { | ||
| val activeSession = SparkSession.builder().master("local").getOrCreate() | ||
| activeSession.udf.register("strlenScala", (_: String).length + (_: Int)) | ||
| val forkedSession = activeSession.newSession(inheritSessionState = true) | ||
|
|
||
| assert(forkedSession ne activeSession) | ||
| assert(forkedSession.sessionState.functionRegistry ne | ||
| activeSession.sessionState.functionRegistry) | ||
| assert(forkedSession.sessionState.functionRegistry.lookupFunction("strlenScala").nonEmpty) | ||
|
|
||
| forkedSession.stop() | ||
| activeSession.stop() | ||
| } | ||
|
|
||
| test("fork new session and inherit experimental methods") { | ||
| object DummyRule1 extends Rule[LogicalPlan] { | ||
| def apply(p: LogicalPlan): LogicalPlan = p | ||
| } | ||
| object DummyRule2 extends Rule[LogicalPlan] { | ||
| def apply(p: LogicalPlan): LogicalPlan = p | ||
| } | ||
| val optimizations = List(DummyRule1, DummyRule2) | ||
|
|
||
| val activeSession = SparkSession.builder().master("local").getOrCreate() | ||
| activeSession.experimental.extraOptimizations = optimizations | ||
|
|
||
| val forkedSession = activeSession.newSession(inheritSessionState = true) | ||
|
|
||
| assert(forkedSession ne activeSession) | ||
| assert(forkedSession.experimental ne activeSession.experimental) | ||
| assert(forkedSession.experimental.extraOptimizations ne | ||
| activeSession.experimental.extraOptimizations) | ||
| assert(forkedSession.experimental.extraOptimizations.toSet == | ||
| activeSession.experimental.extraOptimizations.toSet) | ||
|
|
||
| forkedSession.stop() | ||
| activeSession.stop() | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -493,6 +493,21 @@ class CatalogSuite | |
| } | ||
| } | ||
|
|
||
| test("clone SessionCatalog") { | ||
| // need to test tempTables are cloned | ||
| assert(spark.catalog.listTables().collect().isEmpty) | ||
|
|
||
| createTempTable("my_temp_table") | ||
| assert(spark.catalog.listTables().collect().map(_.name).toSet == Set("my_temp_table")) | ||
|
|
||
| val forkedSession = spark.newSession(inheritSessionState = true) | ||
| assert(forkedSession.catalog.listTables().collect().map(_.name).toSet == Set("my_temp_table")) | ||
|
|
||
| dropTable("my_temp_table") | ||
|
||
| assert(spark.catalog.listTables().collect().map(_.name).toSet == Set()) | ||
| assert(forkedSession.catalog.listTables().collect().map(_.name).toSet == Set("my_temp_table")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Didn't test the other way round. Changes in the forked catalog should not reflect in the original.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes! |
||
| } | ||
|
|
||
| // TODO: add tests for the rest of them | ||
|
||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,13 +32,11 @@ import org.apache.spark.{SparkConf, SparkContext} | |
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.{SparkSession, SQLContext} | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder | ||
| import org.apache.spark.sql.catalyst.expressions.ExpressionInfo | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.execution.QueryExecution | ||
| import org.apache.spark.sql.execution.command.CacheTableCommand | ||
| import org.apache.spark.sql.hive._ | ||
| import org.apache.spark.sql.internal.{SharedState, SQLConf} | ||
| import org.apache.spark.sql.internal.{SessionState, SharedState, SQLConf} | ||
| import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION | ||
| import org.apache.spark.util.{ShutdownHookManager, Utils} | ||
|
|
||
|
|
@@ -115,16 +113,22 @@ class TestHiveContext( | |
| private[hive] class TestHiveSparkSession( | ||
| @transient private val sc: SparkContext, | ||
| @transient private val existingSharedState: Option[SharedState], | ||
| existingSessionState: Option[SessionState], | ||
|
||
| private val loadTestTables: Boolean) | ||
| extends SparkSession(sc) with Logging { self => | ||
|
|
||
| def this(sc: SparkContext, loadTestTables: Boolean) { | ||
| this( | ||
| sc, | ||
| existingSharedState = None, | ||
| existingSessionState = None, | ||
| loadTestTables) | ||
| } | ||
|
|
||
| def this(sc: SparkContext, existingSharedState: Option[SharedState], loadTestTables: Boolean) { | ||
| this(sc, existingSharedState, existingSessionState = None, loadTestTables) | ||
| } | ||
|
|
||
| { // set the metastore temporary configuration | ||
| val metastoreTempConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false) ++ Map( | ||
| ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true", | ||
|
|
@@ -151,7 +155,7 @@ private[hive] class TestHiveSparkSession( | |
| new TestHiveSessionState(self) | ||
|
|
||
| override def newSession(): TestHiveSparkSession = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can change it to override |
||
| new TestHiveSparkSession(sc, Some(sharedState), loadTestTables) | ||
| new TestHiveSparkSession(sc, Some(sharedState), None, loadTestTables) | ||
| } | ||
|
|
||
| private var cacheTables: Boolean = false | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not correct. I would like to say