@@ -64,7 +64,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
// Schema evolution is not supported yet. Here we only pick a single random sample file to
// figure out the schema of the whole dataset.
val sampleFile =
if (conf.getBoolean(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, true)) {
if (AvroFileFormat.ignoreFilesWithoutExtensions(conf)) {
Member:
I tried running queries. The option avro.mapred.ignore.inputs.without.extension is not set in conf; this is a bug in spark-avro.
Please read the value from options. It would also be good to have a new test case with avro.mapred.ignore.inputs.without.extension set to true.
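A sketch of what reading the value from options (with the Hadoop conf as a fallback) might look like; the options map here is assumed to be the Map[String, String] passed to FileFormat.buildReader, and the fallback default mirrors this PR:

// Sketch only: prefer the per-read datasource option over the Hadoop conf.
val ignoreExtension: Boolean = options
  .get("avro.mapred.ignore.inputs.without.extension")
  .map(_.toBoolean)
  .getOrElse(conf.getBoolean(
    AvroFileFormat.IgnoreFilesWithoutExtensionProperty, false))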

@MaxGekk (Member, Author), Jul 14, 2018:
avro.mapred.ignore.inputs.without.extension is a Hadoop parameter. This PR aims to change the default behavior only; I would prefer not to convert the Hadoop parameter into an Avro datasource option here.

@MaxGekk (Member, Author), Jul 14, 2018:

Here is how people have been using the option so far: databricks/spark-avro#71 (comment). We should probably discuss, separately from this PR, how we could fix the "bug" without breaking backward compatibility.

@MaxGekk (Member, Author):

The Hadoop config can be changed like this:

spark
  .sqlContext
  .sparkContext
  .hadoopConfiguration
  .set("avro.mapred.ignore.inputs.without.extension", "true")

Member:

Can we submit a separate PR to add a new option for Avro? We should not rely on hadoopConf to control the behavior of the Avro datasource.
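From the user side, such an option might look roughly like the sketch below; the option name ignoreExtension is illustrative only, with the real name and semantics left to the follow-up PR:

// Hypothetical datasource option replacing the Hadoop conf knob.
spark.read
  .option("ignoreExtension", "false")
  .avro(dir.toString)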

@MaxGekk (Member, Author):

Here is the PR: #21798. Please have a look at it.

files.find(_.getPath.getName.endsWith(".avro")).getOrElse {
throw new FileNotFoundException(
"No Avro files found. Hadoop option \"avro.mapred.ignore.inputs.without.extension\" " +
@@ -172,10 +172,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
// Doing input file filtering is improper because we may generate empty tasks that process no
// input files but stress the scheduler. We should probably add a more general input file
// filtering mechanism for `FileFormat` data sources. See SPARK-16317.
if (
conf.getBoolean(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, true) &&
!file.filePath.endsWith(".avro")
) {
if (AvroFileFormat.ignoreFilesWithoutExtensions(conf) && !file.filePath.endsWith(".avro")) {
Iterator.empty
} else {
val reader = {
@@ -286,4 +283,11 @@ private[avro] object AvroFileFormat {
value.readFields(new DataInputStream(in))
}
}

def ignoreFilesWithoutExtensions(conf: Configuration): Boolean = {
// Files without .avro extensions are not ignored by default
val defaultValue = false

conf.getBoolean(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, defaultValue)
}
}
Binary file added external/avro/src/test/resources/episodesAvro
@@ -39,6 +39,7 @@ import org.apache.spark.sql.types._
class AvroSuite extends SparkFunSuite {
val episodesFile = "src/test/resources/episodes.avro"
val testFile = "src/test/resources/test.avro"
val episodesWithoutExtension = "src/test/resources/episodesAvro"

private var spark: SparkSession = _

@@ -623,7 +624,7 @@ class AvroSuite extends SparkFunSuite {
spark.read.avro("*/*/*/*/*/*/*/something.avro")
}

intercept[FileNotFoundException] {
intercept[java.io.IOException] {
TestUtils.withTempDir { dir =>
FileUtils.touch(new File(dir, "test"))
spark.read.avro(dir.toString)
Member:

We can fix this case as:

spark.read.option("avro.mapred.ignore.inputs.without.extension", false).avro(dir.toString)

The behavior will be the same as before, and we don't need to modify the expected FileNotFoundException.

@MaxGekk (Member, Author), Jul 14, 2018:

I would actually remove this piece of code from the test. It checked the default behavior, which is now covered by a dedicated test. Explicit settings of avro.mapred.ignore.inputs.without.extension should be checked in separate tests where the config is set explicitly.
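A rough sketch of such a dedicated test, reusing this suite's existing helpers (restoring the Hadoop flag afterwards is omitted for brevity):

// Sketch: with the Hadoop flag explicitly set to true, a directory containing
// only extension-less files should again raise "No Avro files found".
test("ignore.inputs.without.extension set to true skips extension-less files") {
  spark.sparkContext.hadoopConfiguration
    .set("avro.mapred.ignore.inputs.without.extension", "true")
  intercept[FileNotFoundException] {
    TestUtils.withTempDir { dir =>
      FileUtils.touch(new File(dir, "test"))
      spark.read.avro(dir.toString)
    }
  }
}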

@@ -809,4 +810,16 @@ class AvroSuite extends SparkFunSuite {
assert(readDf.collect().sameElements(writeDf.collect()))
}
}

test("SPARK-24805: reading files without .avro extension") {
Member:

Nit: can we create a temp path and copy the original episodes.avro into it, so that we don't need two duplicate resource files?
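For example, something like this sketch (FileUtils.copyFile comes from commons-io, which the suite already uses for FileUtils.touch; the row count matches episodes.avro as asserted elsewhere in this suite):

// Sketch: copy the committed episodes.avro into a temp dir under an
// extension-less name, then read it back.
TestUtils.withTempDir { dir =>
  val fileWithoutExtension = new File(dir, "episodes")
  FileUtils.copyFile(new File(episodesFile), fileWithoutExtension)
  val df = spark.read.avro(fileWithoutExtension.toString)
  assert(df.count == 8)
}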

@MaxGekk (Member, Author), Jul 14, 2018:

Wouldn't that just introduce an unnecessary dependency here and overcomplicate the test? If you don't mind, I can create a small Avro file (with just one row) without the .avro extension specifically for this test.

val df1 = spark.read.avro(episodesWithoutExtension)
assert(df1.count == 8)

val schema = new StructType()
.add("title", StringType)
.add("air_date", StringType)
.add("doctor", IntegerType)
val df2 = spark.read.schema(schema).avro(episodesWithoutExtension)
assert(df2.count == 8)
}
}