docs/structured-streaming-programming-guide.md (2 changes: 1 addition & 1 deletion)

@@ -551,7 +551,7 @@ Here are the details of all the sources in Spark.
When "archive" is provided, the additional option <code>sourceArchiveDir</code> must be provided as well. The value of "sourceArchiveDir" must have 2 subdirectories (so that the depth of the directory is greater than 2), e.g. <code>/archived/here</code>. This ensures archived files are never included as new source files.<br/>
Spark will move source files while preserving their own path. For example, if the path of a source file is <code>/a/b/dataset.txt</code> and the path of the archive directory is <code>/archived/here</code>, the file will be moved to <code>/archived/here/a/b/dataset.txt</code>.<br/>
NOTE: Both archiving (via moving) and deleting completed files introduce overhead (slowdown) in each micro-batch, so make sure you understand the cost of each operation in your file system before enabling this option. On the other hand, enabling this option reduces the cost of listing source files, which can be an expensive operation.<br/>
NOTE 2: The source path should not be used from multiple sources or queries when enabling this option.<br/>
NOTE 2: The source path should not be used from multiple sources or queries when enabling this option. Similarly, you must ensure the source path doesn't match any files in the output directory of the file stream sink.<br/>
NOTE 3: Both delete and move actions are best effort. Failing to delete or move files will not fail the streaming query.
<br/><br/>
For file-format-specific options, see the related methods in <code>DataStreamReader</code>
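
A minimal sketch of how a user would enable this option through the DataStreamReader API (the format and paths below are placeholders, not values taken from this change):

    val cleaned = spark.readStream
      .format("text")
      .option("cleanSource", "archive")             // or "delete"
      .option("sourceArchiveDir", "/archived/here") // required when cleanSource is "archive"
      .load("/data/incoming")                       // must not be the output dir of a FileStreamSink
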
FileStreamSource.scala

@@ -259,6 +259,8 @@ class FileStreamSource(

override def toString: String = s"FileStreamSource[$qualifiedBasePath]"

private var warnedIgnoringCleanSourceOption: Boolean = false

/**
* Informs the source that Spark has completed processing all data for offsets less than or
* equal to `end` and will only request offsets greater than `end` in the future.
@@ -267,10 +269,22 @@ class FileStreamSource(
val logOffset = FileStreamSourceOffset(end).logOffset

sourceCleaner.foreach { cleaner =>
val files = metadataLog.get(Some(logOffset), Some(logOffset)).flatMap(_._2)
val validFileEntities = files.filter(_.batchId == logOffset)
logDebug(s"completed file entries: ${validFileEntities.mkString(",")}")
validFileEntities.foreach(cleaner.clean)
sourceHasMetadata match {
case Some(true) if !warnedIgnoringCleanSourceOption =>
Contributor:
Is it possible that it's called more than once? In that case, case _ => will win.

Contributor (author):
Ah yes missed that. Nice finding.

logWarning("Ignoring 'cleanSource' option since source path refers to the output" +
" directory of FileStreamSink.")
warnedIgnoringCleanSourceOption = true

case Some(false) =>
val files = metadataLog.get(Some(logOffset), Some(logOffset)).flatMap(_._2)
val validFileEntities = files.filter(_.batchId == logOffset)
logDebug(s"completed file entries: ${validFileEntities.mkString(",")}")
validFileEntities.foreach(cleaner.clean)

case _ =>
logWarning("Ignoring 'cleanSource' option since Spark hasn't figured out whether " +
Contributor (author):
I just put logWarning here - I was about to throw IllegalStateException, since it doesn't sound feasible to have files from commit() while FileStreamSource still cannot decide, but there might be some edge case, so I avoided being aggressive here.

Member:
How about throwing an UnsupportedOperationException here:

new MetadataLogFileIndex(sparkSession, qualifiedBasePath,

Contributor (author) @HeartSaVioR, Nov 26, 2019:
The only "odd" case I can imagine reaching here is:

  1. the query ran, wrote the commit log of the last batch, and stopped before writing the offset for the next batch.
  2. the query is restarted, and constructNextBatch is called.
  3. somehow the source files are all deleted between 1) and 2), hence FileStreamSource doesn't see any file and cannot decide when fetchAllFiles is called.
  4. constructNextBatch will call commit for the previous batch the query executed before.

It's obviously a very odd case, as the contents of the source directory were modified (maybe manually), which we don't support (so throwing an exception would be OK), but I'm not fully sure there are no other edge cases.

Btw, where do you recommend adding the exception? L287, or L205? If you're suggesting adding the exception at L205, I'm not sure I follow. If I'm understanding correctly, the case where the logic reaches case _ won't reach L205.

Contributor:
I also don't see yet which place the suggestion refers to.

> L205, I'm not sure I follow

+1

L287: As I see it, this is more or less a should-never-happen case. The question is whether there are edge cases which may hit this. If we miss a valid case and throw an exception here, we may block a query from starting.

Member:
> 3. somehow the source files are all deleted between 1) and 2)

This should be a user error.

My general point is that we should make sure the data files and the metadata in _spark_metadata stay consistent, and we should prevent cleaning up data files that are still tracked. Logging a warning without really deleting files is a solution; however, most users won't be able to notice this warning in their logs. Hence we should detect this earlier. There is already a variable, sourceHasMetadata, tracking whether the source is reading from a file stream sink or not. We can check the options and throw an exception when flipping it. What do you think?
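
A rough sketch of the kind of fail-fast check being suggested (the method shape is illustrative; it borrows the sourceHasMetadata and sourceCleaner names from the surrounding code and is not necessarily the exact change that was merged):

    // Illustrative only: reject the 'cleanSource' option as soon as the source is
    // discovered to be reading the output directory of a FileStreamSink.
    private def setSourceHasMetadata(newValue: Option[Boolean]): Unit = newValue match {
      case Some(true) if sourceCleaner.isDefined =>
        throw new UnsupportedOperationException("Clean up source files is not supported when" +
          " reading from the output directory of FileStreamSink.")
      case _ =>
        sourceHasMetadata = newValue
    }
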

Contributor (author):
Ah OK I guess I got your point now. I'm also in favor of being "fail-fast" and the suggestion fits it. Thanks! Just updated.

"source path refers to the output directory of FileStreamSink or not.")
}
}
}

FileStreamSourceSuite.scala

@@ -29,6 +29,7 @@ import org.apache.hadoop.util.Progressable
import org.scalatest.PrivateMethodTester
import org.scalatest.time.SpanSugar._

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.streaming._
@@ -149,6 +150,20 @@ abstract class FileStreamSourceTest
}
}

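/**
 * Test helper: lists the files under `srcDir` that match `pathFilter` and records them in
 * the given FileStreamSinkLog as batch `batchId`, i.e. it makes the source directory look
 * like the output directory of a FileStreamSink.
 */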
case class AddFilesToFileStreamSinkLog(
fs: FileSystem,
srcDir: Path,
sinkLog: FileStreamSinkLog,
batchId: Int)(
pathFilter: Path => Boolean) extends ExternalAction {
override def runAction(): Unit = {
val statuses = fs.listStatus(srcDir, new PathFilter {
override def accept(path: Path): Boolean = pathFilter(path)
})
sinkLog.add(batchId, statuses.map { s => SinkFileStatus(s) })
}
}

/** Use `format` and `path` to create FileStreamSource via DataFrameReader */
def createFileStream(
format: String,
@@ -1617,14 +1632,6 @@ class FileStreamSourceSuite extends FileStreamSourceTest
}

test("remove completed files when remove option is enabled") {
def assertFileIsRemoved(files: Array[String], fileName: String): Unit = {
assert(!files.exists(_.startsWith(fileName)))
}

def assertFileIsNotRemoved(files: Array[String], fileName: String): Unit = {
assert(files.exists(_.startsWith(fileName)))
}

withTempDirs { case (src, tmp) =>
withSQLConf(
SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "2",
@@ -1642,28 +1649,24 @@ class FileStreamSourceSuite extends FileStreamSourceTest
CheckAnswer("keep1"),
AssertOnQuery("input file removed") { _: StreamExecution =>
// it doesn't rename any file yet
assertFileIsNotRemoved(src.list(), "keep1")
assertFileIsNotRemoved(src, "keep1")
true
},
AddTextFileData("keep2", src, tmp, tmpFilePrefix = "ke ep2 %"),
CheckAnswer("keep1", "keep2"),
AssertOnQuery("input file removed") { _: StreamExecution =>
val files = src.list()

// it renames input file for first batch, but not for second batch yet
assertFileIsRemoved(files, "keep1")
assertFileIsNotRemoved(files, "ke ep2 %")
assertFileIsRemoved(src, "keep1")
assertFileIsNotRemoved(src, "ke ep2 %")

true
},
AddTextFileData("keep3", src, tmp, tmpFilePrefix = "keep3"),
CheckAnswer("keep1", "keep2", "keep3"),
AssertOnQuery("input file renamed") { _: StreamExecution =>
val files = src.list()

// it renames input file for second batch, but not third batch yet
assertFileIsRemoved(files, "ke ep2 %")
assertFileIsNotRemoved(files, "keep3")
assertFileIsRemoved(src, "ke ep2 %")
assertFileIsNotRemoved(src, "keep3")

true
}
@@ -1739,6 +1742,58 @@ class FileStreamSourceSuite extends FileStreamSourceTest
}
}

Seq("delete", "archive").foreach { cleanOption =>
test(s"skip $cleanOption when source path refers to the output dir of FileStreamSink") {
withThreeTempDirs { case (src, tmp, archiveDir) =>
withSQLConf(
SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "2",
// Force deleting the old logs
SQLConf.FILE_SOURCE_LOG_CLEANUP_DELAY.key -> "1"
) {
val option = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "1",
"cleanSource" -> cleanOption, "sourceArchiveDir" -> archiveDir.getAbsolutePath)

val fileStream = createFileStream("text", src.getCanonicalPath, options = option)
val filtered = fileStream.filter($"value" contains "keep")

// create FileStreamSinkLog under source directory
val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark,
new File(src, FileStreamSink.metadataDir).getCanonicalPath)
val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf)
val srcPath = new Path(src.getCanonicalPath)
val fileSystem = srcPath.getFileSystem(hadoopConf)

// Here we will just check whether the source file is removed or not, as the
// functionality of "archive" is covered in another UT.
testStream(filtered)(
AddTextFileData("keep1", src, tmp, tmpFilePrefix = "keep1"),
AddFilesToFileStreamSinkLog(fileSystem, srcPath, sinkLog, 0) { path =>
path.getName.startsWith("keep1")
},
CheckAnswer("keep1"),
AssertOnQuery("input file removed") { _: StreamExecution =>
// it doesn't remove any files for recent batch yet
assertFileIsNotRemoved(src, "keep1")
true
},
AddTextFileData("keep2", src, tmp, tmpFilePrefix = "ke ep2 %"),
AddFilesToFileStreamSinkLog(fileSystem, srcPath, sinkLog, 1) { path =>
path.getName.startsWith("ke ep2 %")
},
CheckAnswer("keep1", "keep2"),
AssertOnQuery("input file removed") { _: StreamExecution =>
// it doesn't remove any file in src since it's the output dir of FileStreamSink
assertFileIsNotRemoved(src, "keep1")
// it doesn't remove any files for recent batch yet
assertFileIsNotRemoved(src, "ke ep2 %")
true
}
)
}
}
}
}

class FakeFileSystem(scheme: String) extends FileSystem {
override def exists(f: Path): Boolean = true

@@ -1797,6 +1852,14 @@ class FileStreamSourceSuite extends FileStreamSourceTest
}
}

private def assertFileIsRemoved(sourceDir: File, fileName: String): Unit = {
assert(!sourceDir.list().exists(_.startsWith(fileName)))
}

private def assertFileIsNotRemoved(sourceDir: File, fileName: String): Unit = {
assert(sourceDir.list().exists(_.startsWith(fileName)))
}

private def assertFileIsNotMoved(sourceDir: File, expectedDir: File, filePrefix: String): Unit = {
assert(sourceDir.exists())
assert(sourceDir.list().exists(_.startsWith(filePrefix)))