-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-11290][STREAMING] Basic implementation of trackStateByKey #9256
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
7f15e29
ff77312
1fea358
27dbabc
3fc5067
514eb01
d5b2bec
58eee1e
672e3e6
51465f4
10f6a0e
b7c653d
bd9cd94
be8cffc
6c02f44
23596b8
df927ba
6a75966
0f1b1bc
62d9abd
df3bb1b
b28179f
a78130d
fb5a296
f1a6696
77c9a66
ae64786
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.streaming | ||
|
|
||
| /** | ||
| * Abstract class for getting and updating the tracked state in the `trackStateByKey` operation of | ||
| * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] and | ||
| * [[org.apache.spark.streaming.api.java.JavaPairDStream]]. | ||
| * {{{ | ||
| * | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Todo: Add example in doc. |
||
| * }}} | ||
| */ | ||
| sealed abstract class State[S] { | ||
|
|
||
| /** Whether the state already exists */ | ||
| def exists(): Boolean | ||
|
|
||
| /** | ||
| * Get the state if it exists, otherwise wise it will throw an exception. | ||
| * Check with `exists()` whether the state exists or not before calling `get()`. | ||
| */ | ||
| def get(): S | ||
|
|
||
| /** | ||
| * Update the state with a new value. Note that you cannot update the state if the state is | ||
| * timing out (that is, `isTimingOut() return true`, or if the state has already been removed by | ||
| * `remove()`. | ||
| */ | ||
| def update(newState: S): Unit | ||
|
|
||
| /** Remove the state if it exists. */ | ||
| def remove(): Unit | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW I have no strong feeling here other than that it match existing things, if we have them.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only semantically similar thing is a HashMap, and there both Scala and Java HashMap uses remove() |
||
|
|
||
| /** Is the state going to be timed out by the system after this batch interval */ | ||
| def isTimingOut(): Boolean | ||
|
|
||
| @inline final def getOption(): Option[S] = Option(get()) | ||
|
|
||
| /** Get the state if it exists, otherwise return the default value */ | ||
| @inline final def getOrElse[S1 >: S](default: => S1): S1 = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure is this "call-by-name" parameter Java friendly? Assuming this
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that is probably a valid concern. If users cannot call it from Java, this could be a Scala-only thing. |
||
| if (exists) this.get else default | ||
| } | ||
|
|
||
| @inline final override def toString() = getOption.map { _.toString }.getOrElse("<state not set>") | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We dont "need" it. But its just convenient for people to automatically convert to an option, so that they can write
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We shouldn't have that as part of our API, we don't use implicit conversions in other places in Spark. It will be pretty confusing. Instead they can just do state.getOption.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool. Will remove it. |
||
|
|
||
| /** Internal implementation of the [[State]] interface */ | ||
| private[streaming] class StateImpl[S] extends State[S] { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why did we separate this into another class? I don't see any other subclasses of State or great reasons to create them
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did it so that we can separate the publicly visible interfaces, from the private[streaming] ones. private[streaming] ones are still publicly accessible from Java, so I was trying to avoid that. Other than than I dont really have a strong reason. If that isnt a strong enough reason to have a separate abstract class and concrete implementation, then I can merge them into one. |
||
|
|
||
| private var state: S = null.asInstanceOf[S] | ||
| private var defined: Boolean = true | ||
| private var timingOut: Boolean = false | ||
| private var updated: Boolean = false | ||
| private var removed: Boolean = false | ||
|
|
||
| // ========= Public API ========= | ||
| def exists(): Boolean = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add override modifier to everything that's implementing an interface function |
||
| defined | ||
| } | ||
|
|
||
| def get(): S = { | ||
| state | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. On the todo list to add State unit tests to verify this behavior. |
||
| } | ||
|
|
||
| def update(newState: S): Unit = { | ||
| require(!removed, "Cannot update the state after it has been removed") | ||
| require(!timingOut, "Cannot update the state that is timing out") | ||
| state = newState | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this required for defensive guard
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, if the user accidentally tries to update the state that is going to be removed by timeout anyways, the system should throw an error rather than silently allowing him/her to update without actually being updated. |
||
| updated = true | ||
| } | ||
|
|
||
| def isTimingOut(): Boolean = { | ||
| timingOut | ||
| } | ||
|
|
||
| def remove(): Unit = { | ||
| require(!timingOut, "Cannot remove the state that is timing out") | ||
| removed = true | ||
| } | ||
|
|
||
| // ========= Internal API ========= | ||
|
|
||
| /** Whether the state has been marked for removing */ | ||
| def isRemoved(): Boolean = { | ||
| removed | ||
| } | ||
|
|
||
| /** Whether the state has been been updated */ | ||
| def isUpdated(): Boolean = { | ||
| updated | ||
| } | ||
|
|
||
| /** | ||
| * Internal method to update the state data and reset internal flags in `this`. | ||
| * This method allows `this` object to be reused across many state records. | ||
| */ | ||
| def wrap(optionalState: Option[S]): Unit = { | ||
| optionalState match { | ||
| case Some(newState) => | ||
| this.state = newState | ||
| defined = true | ||
|
|
||
| case None => | ||
| this.state = null.asInstanceOf[S] | ||
| defined = false | ||
| } | ||
| timingOut = false | ||
| removed = false | ||
| updated = false | ||
| } | ||
|
|
||
| /** | ||
| * Internal method to update the state data and reset internal flags in `this`. | ||
| * This method allows `this` object to be reused across many state records. | ||
| */ | ||
| def wrapTiminoutState(newState: S): Unit = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems to contain a typo? It's also not clear how this is different from wrap, from the comment
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, will update comment. |
||
| this.state = newState | ||
| defined = true | ||
| timingOut = true | ||
| removed = false | ||
| updated = false | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.streaming | ||
|
|
||
| import scala.reflect.ClassTag | ||
|
|
||
| import org.apache.spark.{HashPartitioner, Partitioner} | ||
| import org.apache.spark.api.java.JavaPairRDD | ||
| import org.apache.spark.rdd.RDD | ||
|
|
||
|
|
||
| /** | ||
| * Abstract class having all the specifications of DStream.trackStateByKey(). | ||
| * Use the `TrackStateSpec.create()` or `TrackStateSpec.create()` to create instances of this class. | ||
| * | ||
| * {{{ | ||
| * TrackStateSpec(trackingFunction) // in Scala | ||
| * TrackStateSpec.create(trackingFunction) // in Java | ||
| * }}} | ||
| */ | ||
| sealed abstract class TrackStateSpec[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag] | ||
|
||
| extends Serializable { | ||
|
|
||
| def initialState(rdd: RDD[(K, S)]): this.type | ||
| def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type | ||
|
|
||
| def numPartitions(numPartitions: Int): this.type | ||
| def partitioner(partitioner: Partitioner): this.type | ||
|
|
||
| def timeout(interval: Duration): this.type | ||
|
||
| } | ||
|
|
||
|
|
||
| /** Builder object for creating instances of TrackStateSpec */ | ||
| object TrackStateSpec { | ||
|
|
||
| def apply[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag]( | ||
| trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = { | ||
| new TrackStateSpecImpl[K, V, S, T](trackingFunction) | ||
| } | ||
|
|
||
| def create[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag]( | ||
| trackingFunction: (K, Option[V], State[S]) => Option[T]): TrackStateSpec[K, V, S, T] = { | ||
| apply(trackingFunction) | ||
| } | ||
|
||
| } | ||
|
|
||
|
|
||
| /** Internal implementation of [[TrackStateSpec]] interface */ | ||
| private[streaming] | ||
| case class TrackStateSpecImpl[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag]( | ||
| function: (K, Option[V], State[S]) => Option[T]) extends TrackStateSpec[K, V, S, T] { | ||
|
|
||
| require(function != null) | ||
|
|
||
| @volatile private var partitioner: Partitioner = null | ||
| @volatile private var initialStateRDD: RDD[(K, S)] = null | ||
| @volatile private var timeoutInterval: Duration = null | ||
|
|
||
|
|
||
| def initialState(rdd: RDD[(K, S)]): this.type = { | ||
| this.initialStateRDD = rdd | ||
| this | ||
| } | ||
|
|
||
| def initialState(javaPairRDD: JavaPairRDD[K, S]): this.type = { | ||
| this.initialStateRDD = javaPairRDD.rdd | ||
| this | ||
| } | ||
|
|
||
|
|
||
| def numPartitions(numPartitions: Int): this.type = { | ||
| this.partitioner(new HashPartitioner(numPartitions)) | ||
| this | ||
| } | ||
|
|
||
| def partitioner(partitioner: Partitioner): this.type = { | ||
| this.partitioner = partitioner | ||
| this | ||
| } | ||
|
|
||
| def timeout(interval: Duration): this.type = { | ||
| this.timeoutInterval = interval | ||
| this | ||
| } | ||
|
|
||
| // ================= Private Methods ================= | ||
|
|
||
| private[streaming] def getFunction(): (K, Option[V], State[S]) => Option[T] = function | ||
|
|
||
| private[streaming] def getInitialStateRDD(): Option[RDD[(K, S)]] = Option(initialStateRDD) | ||
|
|
||
| private[streaming] def getPartitioner(): Option[Partitioner] = Option(partitioner) | ||
|
|
||
| private[streaming] def getTimeoutInterval(): Option[Duration] = Option(timeoutInterval) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since
updateStateByKeyisn't removed, maybe it's better to keep this example and create a new file fortrackStateByKey.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if we want to encourage people to move to the new API, it might be ok to not have the old one.