apache · WeichenXu123 · Apr 12, 2019 · Apr 12, 2019 · Apr 12, 2019 · Apr 12, 2019
diff --git a/...core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/...core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -8,3 +8,4 @@ org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2
 org.apache.spark.sql.execution.streaming.ConsoleSinkProvider
 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider
 org.apache.spark.sql.execution.streaming.sources.TextSocketSourceProvider
+org.apache.spark.sql.execution.datasources.binaryfile.BinaryFileFormat
diff --git a/...in/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileDataSource.scala b/...in/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileDataSource.scala
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.binaryfile
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.sql.types._
+
+/**
+ * The "binaryFile" data source format implements the Spark SQL data source API for loading
+ * binary file data as a `DataFrame`.
+ *
+ * The loaded `DataFrame` has two columns:
+ *  - content: `BinaryType` (the binary data of the file content)
+ *  - status: `StructType` (the file status information)
+ *
+ * The "status" column has the following fields:
+ *  - path: `StringType` (the file path)
+ *  - modificationTime: `TimestampType` (the last modification time of the file; on some file
+ *                                       system implementations this may not be available, in
+ *                                       which case it falls back to a default value)
+ *  - length: `LongType` (the file length)
+ *
+ * To use the binary file data source, set "binaryFile" as the format in `DataFrameReader` and
+ * optionally specify data source options. The available options are:
+ *  - pathGlobFilter: Only include files whose path matches the glob pattern.
+ *                    The glob pattern behaves the same way as in the Hadoop API
+ *                    `org.apache.hadoop.fs.FileSystem.globStatus(pathPattern)`.
+ *
+ * To control the partition size, set the Spark SQL configurations
+ * `spark.sql.files.maxPartitionBytes` and `spark.sql.files.openCostInBytes`.
+ *
+ * Example:
+ * {{{
+ *   // Scala
+ *   val df = spark.read.format("binaryFile")
+ *     .option("pathGlobFilter", "*.txt")
+ *     .load("path/to/fileDir")
+ *
+ *   // Java
+ *   Dataset<Row> df = spark.read().format("binaryFile")
+ *     .option("pathGlobFilter", "*.txt")
+ *     .load("path/to/fileDir");
+ * }}}
+ *
+ * @note This binary file data source does not support saving a `DataFrame` to binary files.
+ * @note This class is public for documentation purposes only. Please do not use this class
+ * directly; use the data source API as illustrated above instead.
+ */
+@Experimental
+@Since("3.0.0")
+class BinaryFileDataSource private() {}
+
+object BinaryFileDataSource {
+
+  /** Layout of the nested `status` struct; every field is declared non-nullable. */
+  private val fileStatusSchema: StructType = StructType(Seq(
+    StructField("path", StringType, nullable = false),
+    StructField("modificationTime", TimestampType, nullable = false),
+    StructField("length", LongType, nullable = false)))
+
+  /**
+   * Schema of the rows produced by the binaryFile data source: the non-nullable `content`
+   * bytes followed by the non-nullable `status` struct.
+   */
+  val binaryFileSchema: StructType = StructType(Seq(
+    StructField("content", BinaryType, nullable = false),
+    StructField("status", fileStatusSchema, nullable = false)))
+
+}
diff --git a/...c/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/...c/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.binaryfile
+
+import java.sql.Timestamp
+
+import com.google.common.io.{ByteStreams, Closeables}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path}
+import org.apache.hadoop.mapreduce.Job
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.execution.datasources.{DataSource, FileFormat, OutputWriterFactory, PartitionedFile}
+import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.util.SerializableConfiguration
+
+
+/**
+ * `FileFormat` implementation registered under the short name "binaryFile". Each accepted
+ * input file is turned into a single row holding the raw file bytes and a `status` struct
+ * (path, modification time, length), following `BinaryFileDataSource.binaryFileSchema`.
+ * Writing is not supported.
+ */
+private[binaryfile] class BinaryFileFormat extends FileFormat with DataSourceRegister {
+
+  /** The schema is fixed, so it is returned without looking at `options` or `files`. */
+  override def inferSchema(
+      sparkSession: SparkSession,
+      options: Map[String, String],
+      files: Seq[FileStatus]): Option[StructType] = Some(BinaryFileDataSource.binaryFileSchema)
+
+  /** Always throws `UnsupportedOperationException`: this data source is read-only. */
+  override def prepareWrite(
+      sparkSession: SparkSession,
+      job: Job,
+      options: Map[String, String],
+      dataSchema: StructType): OutputWriterFactory = {
+    throw new UnsupportedOperationException("Write is not supported for binary file data source")
+  }
+
+  /** Files are never split: the reader below loads a file's whole content into one row. */
+  override def isSplitable(
+      sparkSession: SparkSession,
+      options: Map[String, String],
+      path: Path): Boolean = {
+    false
+  }
+
+  override def shortName(): String = "binaryFile"
+
+  /**
+   * Builds the per-file read function. A file accepted by the optional `pathGlobFilter`
+   * option yields exactly one row, projected to the columns named in `requiredSchema`;
+   * a file rejected by the filter yields no rows. `partitionSchema` and `filters` are not
+   * consulted here.
+   */
+  override protected def buildReader(
+      sparkSession: SparkSession,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      requiredSchema: StructType,
+      filters: Seq[Filter],
+      options: Map[String, String],
+      hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
+
+    val broadcastedHadoopConf =
+      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
+
+    val pathGlobPattern = new BinaryFileSourceOptions(options).pathGlobFilter
+
+    (file: PartitionedFile) => {
+      val path = file.filePath
+      // `filePath` is a URI string. Building the `Path` from a `java.net.URI` decodes
+      // percent-escaped characters (e.g. "%20") instead of treating them as literal
+      // characters of the file name, which would make the lookup below miss the file.
+      val fsPath = new Path(new java.net.URI(path))
+
+      // TODO: Improve performance here: each file will recompile the glob pattern here.
+      val globFilter = pathGlobPattern.map(new GlobFilter(_))
+      // No filter configured means every file is accepted.
+      if (globFilter.forall(_.accept(fsPath))) {
+        val fs = fsPath.getFileSystem(broadcastedHadoopConf.value.value)
+        val fileStatus = fs.getFileStatus(fsPath)
+        val length = fileStatus.getLen()
+        val modificationTime = new Timestamp(fileStatus.getModificationTime())
+        val stream = fs.open(fsPath)
+
+        val content = try {
+          ByteStreams.toByteArray(stream)
+        } finally {
+          // swallowIOException = true: a failure while closing must not mask the read outcome.
+          Closeables.close(stream, true)
+        }
+
+        val fullOutput = dataSchema.map { f =>
+          AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()
+        }
+        // Materialise the requested names once instead of once per attribute.
+        val requiredNames = requiredSchema.fieldNames.toSet
+        val requiredOutput = fullOutput.filter(a => requiredNames.contains(a.name))
+
+        // Projects the full (content, status) row down to the requested columns.
+        val requiredColumns = GenerateUnsafeProjection.generate(requiredOutput, fullOutput)
+
+        val internalRow = InternalRow(
+          content,
+          InternalRow(
+            UTF8String.fromString(path),
+            DateTimeUtils.fromJavaTimestamp(modificationTime),
+            length
+          )
+        )
+
+        Iterator(requiredColumns(internalRow))
+      } else {
+        Iterator.empty
+      }
+    }
+  }
+}
+
+/**
+ * Options accepted by the binaryFile data source. Option names are looked up through a
+ * `CaseInsensitiveMap`.
+ */
+private[binaryfile] class BinaryFileSourceOptions(
+    @transient private val parameters: CaseInsensitiveMap[String]) extends Serializable {
+
+  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
+
+  /**
+   * Glob pattern that file paths must match to be included; `None` when the option is
+   * absent or its value is null.
+   */
+  val pathGlobFilter: Option[String] =
+    parameters.get("pathGlobFilter").filter(_ != null)
+}
diff --git a/...rc/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileSuite.scala b/...rc/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileSuite.scala
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.binaryfile
+
+import java.io.File
+import java.nio.file.{Files, StandardOpenOption}
+import java.sql.Timestamp
+
+import scala.collection.JavaConverters._
+
+import com.google.common.io.{ByteStreams, Closeables}
+import org.apache.hadoop.fs.{FileSystem, GlobFilter, Path}
+
+import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
+import org.apache.spark.util.Utils
+
+/**
+ * Tests for the "binaryFile" data source: reads a small partitioned directory tree with
+ * several glob filters and checks that writing is rejected.
+ */
+class BinaryFileSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
+
+  private var testDir: String = _
+
+  private var fsTestDir: Path = _
+
+  private var fs: FileSystem = _
+
+  /** Writes `lines` to the file `name` inside `dir`. */
+  private def writeLines(dir: File, name: String, lines: Seq[String]): Unit = {
+    Files.write(
+      new File(dir, name).toPath,
+      lines.asJava,
+      StandardOpenOption.CREATE, StandardOpenOption.WRITE
+    )
+  }
+
+  /** Writes raw `bytes` to the file `name` inside `dir`. */
+  private def writeBytes(dir: File, name: String, bytes: Array[Byte]): Unit = {
+    Files.write(
+      new File(dir, name).toPath,
+      bytes,
+      StandardOpenOption.CREATE, StandardOpenOption.WRITE
+    )
+  }
+
+  /** Reads the whole content of `path` through the suite's file system. */
+  private def readFully(path: Path): Array[Byte] = {
+    val in = fs.open(path)
+    try {
+      ByteStreams.toByteArray(in)
+    } finally {
+      Closeables.close(in, true)
+    }
+  }
+
+  /** Lays out two partition directories (`year=2014`, `year=2015`) with two files each. */
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+
+    testDir = Utils.createTempDir().getAbsolutePath
+    fsTestDir = new Path(testDir)
+    fs = fsTestDir.getFileSystem(sparkContext.hadoopConfiguration)
+
+    val dir2014 = new File(testDir, "year=2014")
+    dir2014.mkdir()
+    val dir2015 = new File(testDir, "year=2015")
+    dir2015.mkdir()
+
+    writeLines(dir2014, "data.txt", Seq("2014-test"))
+    writeBytes(dir2014, "data2.bin", "2014-test-bin".getBytes)
+
+    writeLines(dir2015, "bool.csv", Seq("bool", "True", "False", "true"))
+    writeBytes(dir2015, "data.txt", "2015-test".getBytes)
+  }
+
+  /**
+   * Loads `testDir` through the binaryFile source with the given glob filter and compares the
+   * result against rows rebuilt by listing the same directories directly.
+   */
+  def testBinaryFileDataSource(pathGlobFilter: String): Unit = {
+    val selectedColumns = Seq(
+      col("status.path"),
+      col("status.modificationTime"),
+      col("status.length"),
+      col("content"),
+      col("year") // this is a partition column
+    )
+    val resultDF = spark.read.format("binaryFile")
+      .option("pathGlobFilter", pathGlobFilter)
+      .load(testDir)
+      .select(selectedColumns: _*)
+
+    val filter = new GlobFilter(pathGlobFilter)
+    val expectedRows = for {
+      partitionDir <- fs.listStatus(fsTestDir).toSeq
+      // Directory names look like "year=2014"; the partition column "year" is of `Int` type.
+      year = partitionDir.getPath.getName.split("=")(1).toInt
+      status <- fs.listStatus(partitionDir.getPath).toSeq
+      if filter.accept(status.getPath)
+    } yield {
+      Row(
+        status.getPath.toString.replace("file:/", "file:///"),
+        new Timestamp(status.getModificationTime),
+        status.getLen,
+        readFully(status.getPath),
+        year)
+    }
+
+    checkAnswer(resultDF, expectedRows)
+  }
+
+  test("binary file data source test") {
+    Seq("*.*", "*.bin", "*.txt", "*.{txt,csv}", "*.json").foreach { pattern =>
+      testBinaryFileDataSource(pathGlobFilter = pattern)
+    }
+  }
+
+  test("binary file data source do not support write operation") {
+    val df = spark.read.format("binaryFile").load(testDir)
+    withTempDir { tmpDir =>
+      val target = s"$tmpDir/test_save"
+      val thrown = intercept[UnsupportedOperationException] {
+        df.write.format("binaryFile").save(target)
+      }
+      assert(thrown.getMessage.contains("Write is not supported for binary file data source"))
+    }
+  }
+
+}