apache · sameeragarwal · May 31, 2016 · Jun 1, 2016 · dongjoon-hyun · May 31, 2016
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -19,10 +19,13 @@ package org.apache.spark.sql.execution
 
 import java.util.concurrent.locks.ReentrantReadWriteLock
 
+import org.apache.hadoop.fs.{FileSystem, Path}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.columnar.InMemoryRelation
 import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK
 
@@ -157,4 +160,25 @@ private[sql] class CacheManager extends Logging {
       case _ =>
     }
   }
+
+  /**
+   * Uncaches and removes every cached entry whose logical plan contains a `HadoopFsRelation`
+   * that reads `qualifiedPath`; relation paths are qualified against `fs` before comparing.
+   */
+  private[sql] def invalidateCachedPath(fs: FileSystem, qualifiedPath: Path): Unit = writeLock {
+    // Must be total: a bare `{ case ... }` passed to `find` throws MatchError on any other node.
+    def readsPath(node: LogicalPlan): Boolean = node match {
+      case lr: LogicalRelation => lr.relation match {
+        case hr: HadoopFsRelation => hr.location.paths
+          .exists(_.makeQualified(fs.getUri, fs.getWorkingDirectory) == qualifiedPath)
+        case _ => false
+      }
+      case _ => false
+    }
+    // Snapshot the stale entries first: `cachedData` must not shrink while it is being iterated.
+    val stale = cachedData.filter(_.plan.find(readsPath).isDefined).toList
+    // Uncache without blocking, then drop exactly those entries from the cache list.
+    stale.foreach(_.cachedRepresentation.uncache(blocking = false))
+    cachedData --= stale
+  }
 }
diff --git a/.../scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/.../scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -91,8 +91,12 @@ private[sql] case class InsertIntoHadoopFsRelationCommand(
           throw new IOException(s"Unable to clear output " +
             s"directory $qualifiedOutputPath prior to writing to it")
         }
+        // Invalidate every cached plan that reads from this output path
+        sparkSession.sharedState.cacheManager.invalidateCachedPath(fs, qualifiedOutputPath)
         true
       case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
+        // Invalidate every cached plan that reads from this output path
+      sparkSession.sharedState.cacheManager.invalidateCachedPath(fs, qualifiedOutputPath)
         true
       case (SaveMode.Ignore, exists) =>
         !exists

diff --git a/...src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/...src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
@@ -67,6 +67,28 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       TableIdentifier("tmp"), ignoreIfNotExists = true)
   }
 
+  test("drop cache on overwrite") {
+    withTempDir { dir =>
+      val path = dir.toString
+      spark.range(1000).write.mode("overwrite").parquet(path)
+      // Cache the 1000-row read, then overwrite the files underneath it.
+      assert(spark.read.parquet(path).cache().count() == 1000)
+      spark.range(10).write.mode("overwrite").parquet(path)
+      assert(spark.read.parquet(path).count() == 10) // a stale cache would still report 1000
+    }
+  }
+
+  test("drop cache on append") {
+    withTempDir { dir =>
+      val path = dir.toString
+      spark.range(1000).write.mode("append").parquet(path)
+      // Cache the 1000-row read, then append 10 more rows to the same path.
+      assert(spark.read.parquet(path).cache().count() == 1000)
+      spark.range(10).write.mode("append").parquet(path)
+      assert(spark.read.parquet(path).count() == 1010) // a stale cache would still report 1000
+    }
+  }
+
   test("self-join") {
     // 4 rows, cells of column 1 of row 2 and row 4 are null
     val data = (1 to 4).map { i =>