26 changes: 26 additions & 0 deletions docs/sql-data-sources-orc.md
@@ -172,3 +172,29 @@ When reading from Hive metastore ORC tables and inserting to Hive metastore ORC
<td>2.0.0</td>
</tr>
</table>

## Data Source Option

Data source options of ORC can be set via:
* the `.option`/`.options` methods of
  * `DataFrameReader`
  * `DataFrameWriter`
  * `DataStreamReader`
  * `DataStreamWriter`
@HyukjinKwon (Member) commented on May 25, 2021:

also mention:

* `OPTIONS` clause at [CREATE TABLE USING DATA_SOURCE](sql-ref-syntax-ddl-create-table-datasource.html)


<table class="table">
<tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th><th><b>Scope</b></th></tr>
<tr>
<td><code>mergeSchema</code></td>
<td>None</td>
A reviewer (Member) commented:
@itholic it has the same issue. The default value isn't None but false.

<td>sets whether we should merge schemas collected from all ORC part-files. This will override <code>spark.sql.orc.mergeSchema</code>. The default value is specified in <code>spark.sql.orc.mergeSchema</code>.</td>
<td>read</td>
</tr>
<tr>
<td><code>compression</code></td>
<td>None</td>
A reviewer (Member) commented:
In this case, where the default value doesn't exist, you can follow https://spark.apache.org/docs/latest/configuration.html#runtime-sql-configuration and use (none).

<td>compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, snappy, zlib, lzo, and zstd). This will override <code>orc.compress</code> and <code>spark.sql.orc.compression.codec</code>. If None is set, it uses the value specified in <code>spark.sql.orc.compression.codec</code>.</td>
<td>write</td>
</tr>
</table>
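
For illustration only (this code is not part of the PR), a minimal PySpark sketch of passing these options through `.option` on the reader/writer and through the `OPTIONS` clause of `CREATE TABLE ... USING`; the paths, table name, and columns below are hypothetical:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Per-read option: merge schemas collected from all ORC part-files,
# overriding spark.sql.orc.mergeSchema for this read only.
df = spark.read.option("mergeSchema", "true").orc("/tmp/orc/input")

# Per-write option: choose the compression codec, overriding both
# orc.compress and spark.sql.orc.compression.codec for this write only.
df.write.option("compression", "snappy").orc("/tmp/orc/output")

# The same keys can be supplied through the OPTIONS clause of
# CREATE TABLE ... USING ORC (table name and columns are made up here).
spark.sql("""
    CREATE TABLE orc_options_example (id INT, name STRING)
    USING ORC
    OPTIONS (compression 'zlib')
""")
```
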
Other generic options can be found in <a href="https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html"> Generic File Source Options</a>.
@dongjoon-hyun (Member) commented on May 21, 2021:

Although I know that this is inherited, https://spark.apache.org/docs/latest/ looks fragile to me because it will become a broken link when we cut branch-3.2 on July 1st. In branch-3.2, it should point to the 3.2 documentation only. Shall we use a relative link instead of /latest/?

Like with this PR, we don't know what refactoring will happen in the future.

The PR author (Contributor) replied:
Thanks, @dongjoon-hyun.
I took a look at that, but it seems tricky to create a link for each release in Scaladoc.
I created a JIRA to track it separately: SPARK-35481.
I will take a separate look at it, if that's fine with you!

40 changes: 13 additions & 27 deletions python/pyspark/sql/readwriter.py
@@ -793,28 +793,13 @@ def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=N
Parameters
----------
path : str or list
mergeSchema : str or bool, optional
sets whether we should merge schemas collected from all
ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
The default value is specified in ``spark.sql.orc.mergeSchema``.
pathGlobFilter : str or bool
an optional glob pattern to only include files with paths matching
the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
It does not change the behavior of
`partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_. # noqa
recursiveFileLookup : str or bool
recursively scan a directory for files. Using this option
disables
`partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_. # noqa

modifiedBefore : an optional timestamp to only include files with
modification times occurring before the specified time. The provided timestamp
must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
modifiedAfter : an optional timestamp to only include files with
modification times occurring after the specified time. The provided timestamp
must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
Other Parameters
----------------
Extra options
For the extra options, refer to
`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_ # noqa
A reviewer (Member) commented:
Ditto. Can we have a more robust link here?

in the version you use.

Examples
--------
@@ -1417,12 +1402,13 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
exists.
partitionBy : str or list, optional
names of partitioning columns
compression : str, optional
compression codec to use when saving to file. This can be one of the
known case-insensitive shorten names (none, snappy, zlib, lzo, and zstd).
This will override ``orc.compress`` and
``spark.sql.orc.compression.codec``. If None is set, it uses the value
specified in ``spark.sql.orc.compression.codec``.

Other Parameters
----------------
Extra options
For the extra options, refer to
`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_ # noqa
in the version you use.

Examples
--------
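For illustration only (not part of the diff), a minimal sketch of the keyword-argument forms from the signatures above; it assumes an active SparkSession, and the paths and partition column are hypothetical:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# The keyword arguments on DataFrameReader.orc / DataFrameWriter.orc shown in
# the signatures above map onto the same ORC options, so these calls are
# equivalent to passing the options via .option(...).
df = spark.read.orc("/tmp/orc/input", mergeSchema=True)

(df.write
   .mode("overwrite")
   .orc("/tmp/orc/output", partitionBy="name", compression="zlib"))
```
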
20 changes: 6 additions & 14 deletions python/pyspark/sql/streaming.py
@@ -637,20 +637,12 @@ def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=N

.. versionadded:: 2.3.0

Parameters
----------
mergeSchema : str or bool, optional
sets whether we should merge schemas collected from all
ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
The default value is specified in ``spark.sql.orc.mergeSchema``.
pathGlobFilter : str or bool, optional
an optional glob pattern to only include files with paths matching
the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
It does not change the behavior of `partition discovery`_.
recursiveFileLookup : str or bool, optional
recursively scan a directory for files. Using this option
disables
`partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_. # noqa
Other Parameters
----------------
Extra options
For the extra options, refer to
`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_ # noqa
in the version you use.

Examples
--------
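For illustration only (not part of the diff), a minimal sketch of the same ORC options on the streaming reader; the schema and path are hypothetical:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()

# Streaming file sources need an explicit schema (unless
# spark.sql.streaming.schemaInference is enabled); this one is made up.
schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
])

# The streaming reader accepts the same ORC options as the batch reader;
# mergeSchema here overrides spark.sql.orc.mergeSchema for this stream.
stream_df = (spark.readStream
             .schema(schema)
             .option("mergeSchema", "true")
             .orc("/tmp/orc/stream-input"))

query = (stream_df.writeStream
         .format("console")
         .start())
```
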
21 changes: 4 additions & 17 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -874,23 +874,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads ORC files and returns the result as a `DataFrame`.
*
* You can set the following ORC-specific option(s) for reading ORC files:
* <ul>
* <li>`mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
* we should merge schemas collected from all ORC part-files. This will override
* `spark.sql.orc.mergeSchema`.</li>
* <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
* the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
* It does not change the behavior of partition discovery.</li>
* <li>`modifiedBefore` (batch only): an optional timestamp to only include files with
* modification times occurring before the specified Time. The provided timestamp
* must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
* <li>`modifiedAfter` (batch only): an optional timestamp to only include files with
* modification times occurring after the specified Time. The provided timestamp
* must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
* <li>`recursiveFileLookup`: recursively scan a directory for files. Using this option
* disables partition discovery</li>
* </ul>
* ORC-specific option(s) for reading ORC files can be found in
* <a href=
* "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
A reviewer (Member) commented:
Ditto.

* Data Source Option</a> in the version you use.
*
* @param paths input paths
* @since 2.0.0
@@ -881,14 +881,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* format("orc").save(path)
* }}}
*
* You can set the following ORC-specific option(s) for writing ORC files:
* <ul>
* <li>`compression` (default is the value specified in `spark.sql.orc.compression.codec`):
* compression codec to use when saving to file. This can be one of the known case-insensitive
* shorten names(`none`, `snappy`, `zlib`, `lzo`, and `zstd`). This will override
* `orc.compress` and `spark.sql.orc.compression.codec`. If `orc.compress` is given,
* it overrides `spark.sql.orc.compression.codec`.</li>
* </ul>
* ORC-specific option(s) for writing ORC files can be found in
* <a href=
* "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
* Data Source Option</a> in the version you use.
*
* @since 1.5.0
*/
@@ -453,20 +453,17 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
/**
* Loads a ORC file stream, returning the result as a `DataFrame`.
*
* You can set the following ORC-specific option(s) for reading ORC files:
* You can set the following option(s):
* <ul>
* <li>`maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be
* considered in every trigger.</li>
* <li>`mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
* we should merge schemas collected from all ORC part-files. This will override
* `spark.sql.orc.mergeSchema`.</li>
* <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
* the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
* It does not change the behavior of partition discovery.</li>
* <li>`recursiveFileLookup`: recursively scan a directory for files. Using this option
* disables partition discovery</li>
* </ul>
*
* ORC-specific option(s) for reading ORC file stream can be found in
* <a href=
* "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
* Data Source Option</a> in the version you use.
*
* @since 2.3.0
*/
def orc(path: String): DataFrame = {