[SPARK-30915][SS] CompactibleFileStreamLog: Avoid reading the metadata log file when finding the latest batch ID #27664
```diff
@@ -182,17 +182,26 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     }
   }
 
-  override def getLatest(): Option[(Long, T)] = {
-    val batchIds = fileManager.list(metadataPath, batchFilesFilter)
+  /**
+   * Return the latest batch id without reading the file. This method only checks for the
+   * existence of files, to avoid the cost of reading and deserializing the log file.
+   */
+  def getLatestBatchId(): Option[Long] = {
+    fileManager.list(metadataPath, batchFilesFilter)
       .map(f => pathToBatchId(f.getPath))
       .sorted(Ordering.Long.reverse)
-    for (batchId <- batchIds) {
-      val batch = get(batchId)
-      if (batch.isDefined) {
-        return Some((batchId, batch.get))
+      .headOption
+  }
+
+  override def getLatest(): Option[(Long, T)] = {
+    getLatestBatchId().map { batchId =>
+      val content = get(batchId).getOrElse {
+        // If the latest batch file exists, we must read that file, rather than falling back
+        // to older batches.
+        throw new IllegalStateException(s"failed to read log file for batch $batchId")
```
**Member:** Nit, but maybe we shouldn't introduce a new behavior change here?

**Contributor (Author):** This is part of the change in #25965, which should be dealt with. getLatest shouldn't return content for a batch ID that is less than the latest batch ID - it should fail instead.

**Contributor (Author):** That said, you might find interesting proposals among my old PRs: https://github.com/apache/spark/pulls/HeartSaVioR

**Member:** Thanks for the reference, will take a look later.

**Member:** Thanks for the reference to #25965; LGTM on this change.

**Contributor (Author):** Just pulled the comment in here. Either this PR or #25965 will have to resolve a merge conflict, but I wanted to make sure the code comment is clear either way.
```diff
       }
+      (batchId, content)
     }
-    None
   }
 
   /**
```
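To make the discussed semantics concrete: with the new code, a corrupted or unreadable latest batch file surfaces as an error instead of silently serving an older batch. Below is a minimal, self-contained sketch of that contrast; the stand-in types and values are hypothetical, not the actual Spark classes.

```scala
object GetLatestSemanticsSketch {
  // Stand-in for a metadata log holding batches 0 and 1, where reading
  // batch 1 fails (e.g. the file exists but cannot be deserialized).
  val batchIds: Seq[Long] = Seq(0L, 1L)
  def get(batchId: Long): Option[String] =
    if (batchId == 1L) None else Some(s"content-$batchId")

  // Old behavior: silently fall back to an older batch; returns Some((0, "content-0")).
  def getLatestOld(): Option[(Long, String)] =
    batchIds.sorted(Ordering.Long.reverse)
      .flatMap(id => get(id).map(content => (id, content)))
      .headOption

  // New behavior: fail fast; throws IllegalStateException for batch 1
  // instead of returning stale data.
  def getLatestNew(): Option[(Long, String)] =
    batchIds.sorted(Ordering.Long.reverse).headOption.map { id =>
      val content = get(id).getOrElse(
        throw new IllegalStateException(s"failed to read log file for batch $id"))
      (id, content)
    }

  def main(args: Array[String]): Unit = {
    println(getLatestOld()) // Some((0,content-0))
    println(getLatestNew()) // throws IllegalStateException
  }
}
```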
```diff
@@ -18,7 +18,15 @@
 package org.apache.spark.sql.execution.streaming
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.lang.{Long => JLong}
+import java.net.URI
 import java.nio.charset.StandardCharsets.UTF_8
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.atomic.AtomicLong
+
+import scala.util.Random
+
+import org.apache.hadoop.fs.{FSDataInputStream, Path, RawLocalFileSystem}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.internal.SQLConf
```
```diff
@@ -240,6 +248,44 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession {
     ))
   }
 
+  test("getLatestBatchId") {
```
**Contributor (Author):** I didn't add an E2E test, to keep the test code simple; if we'd prefer E2E, I'll try adding a new test to FileStreamSinkSuite.
```diff
+    withCountOpenLocalFileSystemAsLocalFileSystem {
+      val scheme = CountOpenLocalFileSystem.scheme
+      withSQLConf(SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key -> "3") {
+        withTempDir { dir =>
+          val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark,
+            s"$scheme:///${dir.getCanonicalPath}")
+          for (batchId <- 0L to 2L) {
+            sinkLog.add(
+              batchId,
+              Array(newFakeSinkFileStatus("/a/b/" + batchId, FileStreamSinkLog.ADD_ACTION)))
+          }
+
+          def getCountForOpenOnMetadataFile(batchId: Long): Long = {
+            val path = sinkLog.batchIdToPath(batchId).toUri.getPath
+            CountOpenLocalFileSystem.pathToNumOpenCalled.getOrDefault(path, 0L)
+          }
+
+          CountOpenLocalFileSystem.resetCount()
+
+          assert(sinkLog.getLatestBatchId() === Some(2L))
+          // getLatestBatchId doesn't open the latest metadata log file
+          (0L to 2L).foreach { batchId =>
+            assert(getCountForOpenOnMetadataFile(batchId) === 0L)
+          }
+
+          assert(sinkLog.getLatest().map(_._1).getOrElse(-1L) === 2L)
+          (0L to 1L).foreach { batchId =>
+            assert(getCountForOpenOnMetadataFile(batchId) === 0L)
+          }
+          // getLatest opens the latest metadata log file, which explains the need for
+          // having getLatestBatchId.
+          assert(getCountForOpenOnMetadataFile(2L) === 1L)
+        }
+      }
+    }
+  }
+
   /**
    * Create a fake SinkFileStatus using path and action. Most of tests don't care about other fields
    * in SinkFileStatus.
```
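For context, the test reuses the suite's existing `newFakeSinkFileStatus` helper, whose doc comment appears in the surrounding diff context above. A plausible sketch of such a helper, assuming the shape of Spark's `SinkFileStatus` case class; the placeholder values are illustrative, and the suite's actual implementation may differ:

```scala
// Plausible sketch of the helper the new test calls; only `path` and `action`
// matter to these tests, so every other field gets an arbitrary placeholder.
def newFakeSinkFileStatus(path: String, action: String): SinkFileStatus = {
  SinkFileStatus(
    path = path,
    size = 100L,              // placeholder, not asserted by the test
    isDir = false,
    modificationTime = 100L,  // placeholder
    blockReplication = 1,
    blockSize = 100L,         // placeholder
    action = action)
}
```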
|
|
```diff
@@ -267,4 +313,41 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession {
     val log = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, input.toString)
     log.allFiles()
   }
+
+  private def withCountOpenLocalFileSystemAsLocalFileSystem(body: => Unit): Unit = {
```
|
**Contributor (Author):** The FileSystem-related code I add here is very similar to what I add in #27620. Once either one is merged, I'll rebase and deduplicate.
```diff
+    val optionKey = s"fs.${CountOpenLocalFileSystem.scheme}.impl"
+    val originClassForLocalFileSystem = spark.conf.getOption(optionKey)
+    try {
+      spark.conf.set(optionKey, classOf[CountOpenLocalFileSystem].getName)
+      body
+    } finally {
+      originClassForLocalFileSystem match {
+        case Some(fsClazz) => spark.conf.set(optionKey, fsClazz)
+        case _ => spark.conf.unset(optionKey)
+      }
+    }
+  }
 }
+
+class CountOpenLocalFileSystem extends RawLocalFileSystem {
+  import CountOpenLocalFileSystem._
+
+  override def getUri: URI = {
+    URI.create(s"$scheme:///")
+  }
+
+  override def open(f: Path, bufferSize: Int): FSDataInputStream = {
+    val path = f.toUri.getPath
+    pathToNumOpenCalled.compute(path, (_, v) => {
+      if (v == null) 1L else v + 1
+    })
+    super.open(f, bufferSize)
+  }
+}
+
+object CountOpenLocalFileSystem {
+  val scheme = s"FileStreamSinkLogSuite${math.abs(Random.nextInt)}fs"
+  val pathToNumOpenCalled = new ConcurrentHashMap[String, JLong]
+
+  def resetCount(): Unit = pathToNumOpenCalled.clear()
+}
```
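A note on the mechanism: Hadoop resolves a URI scheme to a `FileSystem` implementation through the `fs.<scheme>.impl` configuration key, which is exactly what `withCountOpenLocalFileSystemAsLocalFileSystem` sets on the Spark conf. A minimal sketch of the same mechanism using the Hadoop API directly; the file path is arbitrary, and this assumes Hadoop on the classpath plus the `CountOpenLocalFileSystem` classes above:

```scala
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SchemeRegistrationSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Map the custom scheme to the counting FileSystem -- the same mapping
    // the test helper applies via the "fs.<scheme>.impl" conf key.
    conf.set(s"fs.${CountOpenLocalFileSystem.scheme}.impl",
      classOf[CountOpenLocalFileSystem].getName)

    val fs = FileSystem.get(URI.create(s"${CountOpenLocalFileSystem.scheme}:///"), conf)
    val path = new Path("/tmp/scheme-sketch.txt")

    val out = fs.create(path, true) // overwrite if the file already exists
    out.writeBytes("hello")
    out.close()

    fs.open(path).close() // routed through the override, so one open() is recorded

    // Expect a count of 1 for "/tmp/scheme-sketch.txt".
    println(CountOpenLocalFileSystem.pathToNumOpenCalled)
  }
}
```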
Nice catch!
> spark/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala, line 196 (at 7ad6ba3)
>
> spark/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala, line 99 (at 7ad6ba3)
Can these two places also be optimized in this way?
Yeah, I think so. Nice finding, thanks!
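For illustration, a hypothetical sketch of the kind of follow-up being suggested here (the actual code at the referenced lines may look different): wherever only the ID is needed, calling `getLatestBatchId()` avoids the read-and-deserialize cost that `getLatest()` pays.

```scala
// Hypothetical sketch only; a stand-in trait for the two HDFSMetadataLog methods.
object FollowUpSketch {
  trait MetadataLogView[T] {
    def getLatest(): Option[(Long, T)]   // lists files AND reads/deserializes the latest
    def getLatestBatchId(): Option[Long] // lists file names only
  }

  def latestBatchId[T](log: MetadataLogView[T]): Long = {
    // Before: log.getLatest().map(_._1).getOrElse(-1L) pays a full file read.
    // After: the ID comes from the file listing alone.
    log.getLatestBatchId().getOrElse(-1L)
  }
}
```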