[SPARK-13809][SQL] State store for streaming aggregations #11645
In `HDFSBackedStateStoreProvider`, the now-empty companion object is removed:

```diff
@@ -583,6 +583,3 @@ private[state] class HDFSBackedStateStoreProvider(
     }
   }
 }
-
-private[state] object HDFSBackedStateStoreProvider {
-}
```
In `StateStoreCoordinatorRef`, the single `apply` factory is split into `forDriver` and `forExecutor`:

```diff
@@ -53,19 +53,25 @@ private[sql] object StateStoreCoordinatorRef extends Logging {
   /**
    * Create a reference to a [[StateStoreCoordinator]], This can be called from driver as well as
    * executors.
    */
-  def apply(env: SparkEnv): StateStoreCoordinatorRef = synchronized {
+  def forDriver(env: SparkEnv): StateStoreCoordinatorRef = synchronized {
     try {
       val coordinator = new StateStoreCoordinator(env.rpcEnv)
       val coordinatorRef = env.rpcEnv.setupEndpoint(endpointName, coordinator)
       logInfo("Registered StateStoreCoordinator endpoint")
       new StateStoreCoordinatorRef(coordinatorRef)
     } catch {
       case e: IllegalArgumentException =>
-        logDebug("Retrieving existing StateStoreCoordinator endpoint")
         val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName, env.conf, env.rpcEnv)
+        logDebug("Retrieved existing StateStoreCoordinator endpoint")
         new StateStoreCoordinatorRef(rpcEndpointRef)
     }
   }
+
+  def forExecutor(env: SparkEnv): StateStoreCoordinatorRef = synchronized {
+    val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName, env.conf, env.rpcEnv)
+    logDebug("Retrieved existing StateStoreCoordinator endpoint")
+    new StateStoreCoordinatorRef(rpcEndpointRef)
+  }
 }

 /**
```

> **Member** (on the Scaladoc above): This cannot be called from executors, as creating […]
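The split makes call sites explicit about where they run. A brief usage sketch (the dispatch below is illustrative, not a call site from this PR; `SparkEnv.get` and `executorId` are existing Spark APIs, everything else is an assumption):

```scala
import org.apache.spark.SparkEnv

// The driver registers the StateStoreCoordinator RPC endpoint (or resolves
// the existing reference if registration already happened); executors never
// register and only resolve a reference to the driver's endpoint.
val env = SparkEnv.get
val coordinatorRef =
  if (env.executorId == "driver") {
    StateStoreCoordinatorRef.forDriver(env)   // create-or-lookup path
  } else {
    StateStoreCoordinatorRef.forExecutor(env) // lookup-only path
  }
```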
In `StateStoreRDDSuite`, a multi-executor test is added:

```diff
@@ -142,6 +142,38 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterAll {
     }
   }
 
+  test("distributed test") {
+    quietly {
+      withSpark(new SparkContext(sparkConf.setMaster("local-cluster[2, 1, 1024]"))) { sc =>
+        implicit val sqlContext = new SQLContext(sc)
+        val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString
+        val increment = (store: StateStore, iter: Iterator[String]) => {
+          iter.foreach { s =>
+            store.update(
+              stringToRow(s), oldRow => {
+                val oldValue = oldRow.map(rowToInt).getOrElse(0)
+                intToRow(oldValue + 1)
+              })
+          }
+          store.commit()
+          store.iterator().map(rowsToStringInt)
+        }
+        val opId = 0
+        val rdd1 = makeRDD(sc, Seq("a", "b", "a")).mapPartitionWithStateStore(
+          increment, path, opId, storeVersion = 0, keySchema, valueSchema)
+        assert(rdd1.collect().toSet === Set("a" -> 2, "b" -> 1))
+
+        // Generate next version of stores
+        val rdd2 = makeRDD(sc, Seq("a", "c")).mapPartitionWithStateStore(
+          increment, path, opId, storeVersion = 1, keySchema, valueSchema)
+        assert(rdd2.collect().toSet === Set("a" -> 3, "b" -> 1, "c" -> 1))
+
+        // Make sure the previous RDD still has the same data.
+        assert(rdd1.collect().toSet === Set("a" -> 2, "b" -> 1))
+      }
+    }
+  }
+
   private def makeRDD(sc: SparkContext, seq: Seq[String]): RDD[String] = {
     sc.makeRDD(seq, 2).groupBy(x => x).flatMap(_._2)
   }
```

> **Member** (on `sparkConf.setMaster(...)`): nit: should clone […]
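The test exercises the versioned-snapshot contract of the state store: each `commit()` produces a new store version, and earlier versions stay readable. A minimal, self-contained sketch of those semantics in plain Scala (hypothetical names, not the actual `StateStore` API):

```scala
object VersionedStoreSketch {
  // version -> (key -> count); version 0 starts empty
  private var versions = Map(0 -> Map.empty[String, Int])

  // Load `version`, increment the count of every key seen, and commit the
  // result as `version + 1`, leaving `version` itself untouched.
  def increment(version: Int, keys: Seq[String]): Map[String, Int] = {
    val base = versions(version)
    val updated = keys.foldLeft(base) { (m, k) => m.updated(k, m.getOrElse(k, 0) + 1) }
    versions += (version + 1) -> updated
    updated
  }

  def read(version: Int): Map[String, Int] = versions(version)
}

// Mirrors the assertions in the test above:
//   VersionedStoreSketch.increment(0, Seq("a", "b", "a")) == Map("a" -> 2, "b" -> 1)
//   VersionedStoreSketch.increment(1, Seq("a", "c"))      == Map("a" -> 3, "b" -> 1, "c" -> 1)
//   VersionedStoreSketch.read(1)                          == Map("a" -> 2, "b" -> 1)  // unchanged
```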
> **Member:** These should be defined in `SQLConf`.

> **Author:** This cannot be defined in `SQLConf`, as this is an executor-wide configuration. I am renaming this to `spark.streaming.stateStore.maintenanceInterval`.
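Since the maintenance interval ends up as a Spark-level rather than SQL-level setting, it would be supplied through `SparkConf` instead of `SQLConf`. A hedged usage sketch (the `"60s"` value and duration format are assumptions, not taken from this PR):

```scala
import org.apache.spark.SparkConf

// Executor-wide setting, so it goes in the SparkConf used to launch the
// application, not in a per-session SQLConf.
val conf = new SparkConf()
  .setAppName("state-store-example")
  .set("spark.streaming.stateStore.maintenanceInterval", "60s")
```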