[SPARK-19876][SS][WIP] OneTime Trigger Executor #17219
Changes from 4 commits
OffsetCommitLog.scala (new file):

@@ -0,0 +1,61 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.streaming

import java.io.{InputStream, OutputStream}
import java.nio.charset.StandardCharsets._

import scala.io.{Source => IOSource}

import org.apache.spark.sql.SparkSession

class OffsetCommitLog(sparkSession: SparkSession, path: String)
  extends HDFSMetadataLog[Option[String]](sparkSession, path) {

  override protected def deserialize(in: InputStream): Option[String] = {
    // called inside a try-finally where the underlying stream is closed in the caller
    val lines = IOSource.fromInputStream(in, UTF_8.name()).getLines()
    if (!lines.hasNext) {
      throw new IllegalStateException("Incomplete log file")
    }
    val version = lines.next()
    if (version != OffsetCommitLog.VERSION) {
      throw new IllegalStateException(s"Unknown log version: ${version}")
    }
    // read metadata
    lines.next().trim match {
      case OffsetCommitLog.SERIALIZED_VOID => None
      case metadata => Some(metadata)
    }
  }

  override protected def serialize(metadata: Option[String], out: OutputStream): Unit = {
    // called inside a try-finally where the underlying stream is closed in the caller
    out.write(OffsetCommitLog.VERSION.getBytes(UTF_8))
    out.write('\n')

    // write metadata or void
    out.write(metadata.getOrElse(OffsetCommitLog.SERIALIZED_VOID).getBytes(UTF_8))
  }
}

object OffsetCommitLog {
  private val VERSION = "v1"
  private val SERIALIZED_VOID = "-"
}
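As a sketch of the on-disk format implied by the serialize/deserialize pair above (illustrative, not part of the diff): each commit file is two lines of UTF-8 text, the version string followed by the metadata, with "-" standing in for None. So a call like commitLog.add(0, None) produces a file whose contents are:

    v1
    -

and deserialize maps that second line back to None on recovery.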
StreamExecution.scala:
@@ -162,6 +162,7 @@ class StreamExecution(
   private val triggerExecutor = trigger match {
     case t: ProcessingTime => ProcessingTimeExecutor(t, triggerClock)
+    case _ @ OneTime => OneTimeExecutor()
   }

   /** Defines the internal state of execution */
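OneTimeExecutor itself does not appear in the hunks shown on this page. Assuming it follows the same TriggerExecutor contract as ProcessingTimeExecutor (execute takes a batch-runner thunk returning Boolean), a minimal sketch would be:

    case class OneTimeExecutor() extends TriggerExecutor {
      /** Execute the batch runner exactly once, then return so the query can terminate. */
      override def execute(batchRunner: () => Boolean): Unit = batchRunner()
    }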
@@ -206,6 +207,12 @@ class StreamExecution(
    */
   val offsetLog = new OffsetSeqLog(sparkSession, checkpointFile("offsets"))

+  /**
+   * A log that records the committed batch ids. This is used to check if a batch was committed
+   * on restart, instead of (possibly) re-running the previous batch.
+   */
+  val commitLog = new OffsetCommitLog(sparkSession, checkpointFile("commits"))
+
   /** Whether all fields of the query have been initialized */
   private def isInitialized: Boolean = state.get != INITIALIZING
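For context, the checkpoint directory now carries two metadata logs side by side. HDFSMetadataLog keeps one file per batch id, so the layout looks roughly like this (directory names come from the code above; batch ids are illustrative):

    <checkpointLocation>/
      offsets/
        0          written when batch 0 is constructed
        1
      commits/
        0          written by commitLog.add once batch 0 completes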
@@ -284,6 +291,7 @@ class StreamExecution(
         finishTrigger(dataAvailable)
         if (dataAvailable) {
           // We'll increase currentBatchId after we complete processing current batch's data
+          commitLog.add(currentBatchId, None)
           currentBatchId += 1
         } else {
           currentStatus = currentStatus.copy(isDataAvailable = false)
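Taken together with the offset log, this establishes the per-batch write-ahead pattern sketched below (a simplification; the offsetLog.add call lives elsewhere in StreamExecution and is not part of this diff):

    // Illustrative ordering, not literal code from the PR:
    offsetLog.add(currentBatchId, availableOffsets)  // intent: logged before the batch runs
    runBatch()                                       // process the data for this batch
    commitLog.add(currentBatchId, None)              // completion: logged after the batch finishes
    currentBatchId += 1                              // only then advance to the next batch id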
@@ -377,17 +385,25 @@ class StreamExecution(
   private def populateStartOffsets(): Unit = {
     offsetLog.getLatest() match {
       case Some((batchId, nextOffsets)) =>
-        logInfo(s"Resuming streaming query, starting with batch $batchId")
-        currentBatchId = batchId
-        availableOffsets = nextOffsets.toStreamProgress(sources)
-        offsetSeqMetadata = nextOffsets.metadata.getOrElse(OffsetSeqMetadata())
-        logDebug(s"Found possibly unprocessed offsets $availableOffsets " +
-          s"at batch timestamp ${offsetSeqMetadata.batchTimestampMs}")
-
-        offsetLog.get(batchId - 1).foreach {
-          case lastOffsets =>
-            committedOffsets = lastOffsets.toStreamProgress(sources)
-            logDebug(s"Resuming with committed offsets: $committedOffsets")
-        }
+        currentBatchId = commitLog.getLatest() match {
+          case Some((committedBatchId, metadata)) => committedBatchId + 1
+          case None => batchId // can only happen if batchId is 0
+        }
+        if (currentBatchId > 0) {
+          offsetLog.get(currentBatchId - 1).foreach {
+            case lastOffsets =>
+              committedOffsets = lastOffsets.toStreamProgress(sources)
+              logDebug(s"Resuming with committed offsets: $committedOffsets")
+          }
+        }
+        logInfo(s"Resuming streaming query, starting with batch $currentBatchId")
+        if (currentBatchId == batchId) {
+          availableOffsets = nextOffsets.toStreamProgress(sources)
+          offsetSeqMetadata = nextOffsets.metadata.getOrElse(OffsetSeqMetadata())
+          logDebug(s"Found possibly unprocessed offsets $availableOffsets " +
+            s"at batch timestamp ${offsetSeqMetadata.batchTimestampMs}")
+        } else {
+          constructNextBatch()
+        }
       case None => // We are starting this stream for the first time.
         logInfo(s"Starting new streaming query.")

Contributor (inline, on the removed logDebug): why remove this debug log?

Contributor: Yeah, this does not make sense here any more. But please add similar logging of the recovered metadata later, where you have logged the start and available offsets.
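To make the recovery branches concrete, consider a restart where the offset log's latest entry is batch 5 (batch ids illustrative):

- Commit log's latest entry is 5: batch 5 completed, so currentBatchId becomes 6 and constructNextBatch() plans a fresh batch.
- Commit log's latest entry is 4: batch 5 was planned but never committed, so currentBatchId equals batchId and batch 5 is re-run from its logged offsets.
- Commit log is empty: per the inline comment this can only happen when batchId is 0, and batch 0 is re-run from its logged offsets.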
Trigger.scala:
@@ -36,6 +36,26 @@ import org.apache.spark.unsafe.types.CalendarInterval
 @InterfaceStability.Evolving
 sealed trait Trigger

+/**
+ * :: Experimental ::
+ * A trigger that runs a query once and then terminates.
+ *
+ * Scala Example:
+ * {{{
+ *   df.write.trigger(OneTime)
+ * }}}
+ *
+ * Java Example:
+ * {{{
+ *   df.write.trigger(OneTime.create())
+ * }}}
+ *
+ * @since 2.2.0
+ */
+@Experimental
+@InterfaceStability.Evolving
+object OneTime extends Trigger
+
 /**
  * :: Experimental ::
  * A trigger that runs a query periodically based on the processing time. If `interval` is 0,
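A sketch of end-to-end usage with the streaming writer (paths and sink format are illustrative; the scaladoc's df.write mirrors the existing ProcessingTime examples, while streaming queries go through writeStream):

    val query = df.writeStream
      .format("parquet")
      .option("checkpointLocation", "/tmp/checkpoint")
      .trigger(OneTime)          // process whatever data is available once, then stop
      .start("/tmp/output")
    query.awaitTermination()     // returns once the single batch has been committed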
StreamingAggregationSuite.scala:
@@ -277,7 +277,8 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with BeforeAndAfte
         true
       },
       StartStream(ProcessingTime("10 seconds"), triggerClock = clock),
-      CheckLastBatch((20L, 1), (85L, 1)),
+      // The commit log should ensure that we do not run another batch
+      CheckLastBatch(),
       AssertOnQuery { q =>
         clock.getTimeMillis() == 90000L
       },
@@ -327,7 +328,8 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with BeforeAndAfte
         true
       },
       StartStream(ProcessingTime("10 day"), triggerClock = clock),
-      CheckLastBatch((20L, 1), (85L, 1)),
+      // Commit log should prevent batch from running again
+      CheckLastBatch(),
       // advance clock to 100 days, should retain keys >= 90
       AddData(inputData, 85L, 90L, 100L, 105L),
StreamingQueryListenerSuite.scala:
@@ -109,8 +109,9 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter {

       // Termination event generated with exception message when stopped with error
       StartStream(ProcessingTime(100), triggerClock = clock),
+      AdvanceManualClock(100), // advance clock to ensure completed initial trigger
       AddData(inputData, 0),
-      AdvanceManualClock(100),
+      AdvanceManualClock(100), // process bad data
       ExpectFailure[SparkException](),
       AssertOnQuery { query =>
         eventually(Timeout(streamingTimeout)) {
Review comment: Scala doc please