[SPARK-20568][SS] Provide option to clean up completed files in streaming query #22952
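For context before the diff: a minimal usage sketch (not taken from the PR itself) of how the new source options might be set on a file stream. The option names cleanSource and sourceArchiveDir come from the code changes below; the literal values "archive" and "delete", the json format, and the paths are assumptions for illustration, and spark is an existing SparkSession.

// Hypothetical usage sketch: ask the file source to clean up each input file
// once the batch containing it has been committed.
val input = spark.readStream
  .format("json")
  .option("cleanSource", "archive")                     // assumed value; "delete" would remove files instead
  .option("sourceArchiveDir", "/data/archive/stream1")  // only meaningful for the archive mode
  .load("/data/input")

With the archive mode, completed files are moved under the archive directory; with the delete mode they are removed outright, as implemented by SourceFileArchiver and SourceFileRemover in the diff below.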
@@ -20,7 +20,10 @@ package org.apache.spark.sql.execution.streaming
 import java.net.URI
 import java.util.concurrent.TimeUnit._

-import org.apache.hadoop.fs.{FileStatus, Path}
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
@@ -53,6 +56,9 @@ class FileStreamSource(
     fs.makeQualified(new Path(path)) // can contain glob patterns
   }

+  private val sourceCleaner: Option[FileStreamSourceCleaner] = FileStreamSourceCleaner(
+    fs, qualifiedBasePath, sourceOptions, hadoopConf)
+
   private val optionsWithPartitionBasePath = sourceOptions.optionMapWithoutPath ++ {
     if (!SparkHadoopUtil.get.isGlobPath(new Path(path)) && options.contains("path")) {
       Map("basePath" -> path)
@@ -258,16 +264,21 @@ class FileStreamSource(
   * equal to `end` and will only request offsets greater than `end` in the future.
   */
  override def commit(end: Offset): Unit = {
-    // No-op for now; FileStreamSource currently garbage-collects files based on timestamp
-    // and the value of the maxFileAge parameter.
+    val logOffset = FileStreamSourceOffset(end).logOffset
+
+    sourceCleaner.foreach { cleaner =>
+      val files = metadataLog.get(Some(logOffset), Some(logOffset)).flatMap(_._2)
+      val validFileEntities = files.filter(_.batchId == logOffset)
+      logDebug(s"completed file entries: ${validFileEntities.mkString(",")}")
+      validFileEntities.foreach(cleaner.clean)
+    }
  }

  override def stop(): Unit = {}
}

object FileStreamSource {

  /** Timestamp for file modification time, in ms since January 1, 1970 UTC. */
  type Timestamp = Long
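To make the commit-time cleanup above concrete, here is a small self-contained model (plain Scala collections rather than the real Offset/MetadataLog APIs; all names are illustrative) showing that only the file entries of the committed batch are handed to the cleaner:

// Simplified stand-in for the metadata log: batchId -> file entries added in that batch.
case class Entry(path: String, batchId: Long)

val metadataLog: Map[Long, Seq[Entry]] = Map(
  0L -> Seq(Entry("/data/input/a.json", 0L)),
  1L -> Seq(Entry("/data/input/b.json", 1L), Entry("/data/input/c.json", 1L)))

def commit(logOffset: Long, clean: Entry => Unit): Unit = {
  // Mirrors files.filter(_.batchId == logOffset) in the diff: clean exactly one batch.
  metadataLog.getOrElse(logOffset, Seq.empty)
    .filter(_.batchId == logOffset)
    .foreach(clean)
}

commit(1L, e => println(s"cleaning ${e.path}"))  // cleans b.json and c.json only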
@@ -330,4 +341,96 @@ object FileStreamSource {

     def size: Int = map.size()
   }
+
+  private[sql] trait FileStreamSourceCleaner {
+    def clean(entry: FileEntry): Unit
+  }
+
+  private[sql] object FileStreamSourceCleaner {
+    def apply(
+        fileSystem: FileSystem,
+        sourcePath: Path,
+        option: FileStreamOptions,
+        hadoopConf: Configuration): Option[FileStreamSourceCleaner] = option.cleanSource match {
+      case CleanSourceMode.ARCHIVE =>
+        require(option.sourceArchiveDir.isDefined)
+        val path = new Path(option.sourceArchiveDir.get)
+        val archiveFs = path.getFileSystem(hadoopConf)
+        val qualifiedArchivePath = archiveFs.makeQualified(path)
+        Some(new SourceFileArchiver(fileSystem, sourcePath, archiveFs, qualifiedArchivePath))
+
+      case CleanSourceMode.DELETE =>
+        Some(new SourceFileRemover(fileSystem))
+
+      case _ => None
+    }
+  }
+
+  private[sql] class SourceFileArchiver(
+      fileSystem: FileSystem,
+      sourcePath: Path,
+      baseArchiveFileSystem: FileSystem,
+      baseArchivePath: Path) extends FileStreamSourceCleaner with Logging {
+    assertParameters()
+
+    private def assertParameters(): Unit = {
+      require(fileSystem.getUri == baseArchiveFileSystem.getUri, "Base archive path is located " +
+        s"on a different file system than the source files. source path: $sourcePath" +
+        s" / base archive path: $baseArchivePath")
+
+      /**
+       * FileStreamSource reads the files which one of below conditions is met:
+       * 1) file itself is matched with source path
+       * 2) parent directory is matched with source path
Member:
@HeartSaVioR Could you clarify this? I think there are some cases where we still read files even though they don't meet these conditions:

Contributor (Author):
@zsxwing Regarding the two cases you added: FileStreamSource can read any files under the source path, which invalidates the depth check. There are three options to deal with this:

Which one (or which combination of them) would be the preferred approach?

Contributor (Author):
FYI, just filed https://issues.apache.org/jira/browse/SPARK-30281 and raised a patch picking option 2: #26920
+       *
+       * Checking with glob pattern is costly, so set this requirement to eliminate the cases
+       * where the archive path can be matched with source path. For example, when file is moved
+       * to archive directory, destination path will retain input file's path as suffix, so
+       * destination path can't be matched with source path if archive directory's depth is
+       * longer than 2, as neither file nor parent directory of destination path can be matched
+       * with source path.
+       */
+      require(baseArchivePath.depth() > 2, "Base archive path must have at least 2 " +
+        "subdirectories from root directory. e.g. '/data/archive'")
+    }
+
+    override def clean(entry: FileEntry): Unit = {
+      val curPath = new Path(new URI(entry.path))
+      val newPath = new Path(baseArchivePath.toString.stripSuffix("/") + curPath.toUri.getPath)
+
+      try {
+        logDebug(s"Creating directory if it doesn't exist ${newPath.getParent}")
+        if (!fileSystem.exists(newPath.getParent)) {
+          fileSystem.mkdirs(newPath.getParent)
+        }
+
+        logDebug(s"Archiving completed file $curPath to $newPath")
+        if (!fileSystem.rename(curPath, newPath)) {
+          logWarning(s"Fail to move $curPath to $newPath / skip moving file.")
+        }
+      } catch {
+        case NonFatal(e) =>
+          logWarning(s"Fail to move $curPath to $newPath / skip moving file.", e)
+      }
+    }
+  }
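To illustrate the rename target computed by SourceFileArchiver.clean above, here is a standalone sketch using Hadoop's Path (the concrete file name and archive directory are made up): the archived location keeps the source file's own path as a suffix of the base archive path, which is the property the depth() > 2 requirement relies on.

import java.net.URI
import org.apache.hadoop.fs.Path

val baseArchivePath = new Path("/data/archive/stream1")  // depth 3, passes the depth() > 2 check
val curPath = new Path(new URI("hdfs://nn/data/input/part-0001.json"))

// Same computation as in clean(): base archive path + the file's original path as suffix.
val newPath = new Path(baseArchivePath.toString.stripSuffix("/") + curPath.toUri.getPath)
println(newPath)  // /data/archive/stream1/data/input/part-0001.json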
+
+  private[sql] class SourceFileRemover(fileSystem: FileSystem)
+    extends FileStreamSourceCleaner with Logging {
+
+    override def clean(entry: FileEntry): Unit = {
+      val curPath = new Path(new URI(entry.path))
+      try {
+        logDebug(s"Removing completed file $curPath")
+
+        if (!fileSystem.delete(curPath, false)) {
+          logWarning(s"Failed to remove $curPath / skip removing file.")
+        }
+      } catch {
+        case NonFatal(e) =>
+          // Log to error but swallow exception to avoid process being stopped
+          logWarning(s"Fail to remove $curPath / skip removing file.", e)
+      }
+    }
+  }
+}
Reviewer:
I'd drop s3n & s3 refs as they have gone from deprecated to deceased.

Contributor (Author):
This looks beyond the scope of this PR; we can address it in a separate PR. Could you raise another one?