From 8049a203b8c5f2f8045701916e66cfc786e16b57 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 13 Sep 2023 15:51:27 +0800
Subject: [PATCH 1/4] [SPARK-45141][PYTHON][INFRA][TESTS] Pin `pyarrow==12.0.1` in CI

### What changes were proposed in this pull request?
Pin `pyarrow==12.0.1` in CI.

### Why are the changes needed?
To fix the test failure seen in https://github.com/apache/spark/actions/runs/6167186123/job/16738683632 (a minimal reproduction sketch follows this patch's diff):

```
======================================================================
FAIL [0.095s]: test_from_to_pandas (pyspark.pandas.tests.data_type_ops.test_datetime_ops.DatetimeOpsTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/__w/spark/spark/python/pyspark/testing/pandasutils.py", line 122, in _assert_pandas_equal
    assert_series_equal(
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 931, in assert_series_equal
    assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 415, in assert_attr_equal
    raise_assert_detail(obj, msg, left_attr, right_attr)
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 599, in raise_assert_detail
    raise AssertionError(msg)
AssertionError: Attributes of Series are different

Attribute "dtype" are different
[left]:  datetime64[ns]
[right]: datetime64[us]
```

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI and manual tests

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #42897 from zhengruifeng/pin_pyarrow.

Authored-by: Ruifeng Zheng
Signed-off-by: Ruifeng Zheng
(cherry picked from commit e3d2dfa8b514f9358823c3cb1ad6523da8a6646b)
Signed-off-by: Dongjoon Hyun
---
 .github/workflows/build_and_test.yml | 4 ++--
 dev/infra/Dockerfile                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0760a955342f..8488540b415d5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -258,7 +258,7 @@ jobs:
     - name: Install Python packages (Python 3.8)
       if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       run: |
-        python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
+        python3.8 -m pip install 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
         python3.8 -m pip list
     # Run the tests.
     - name: Run tests
@@ -684,7 +684,7 @@ jobs:
         # See also https://issues.apache.org/jira/browse/SPARK-38279.
        python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme 'sphinx-copybutton==0.5.2' nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' 'nest-asyncio==1.5.8' 'rpds-py==0.16.2' 'alabaster==0.7.13'
        python3.9 -m pip install ipython_genutils # See SPARK-38517
-       python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
+       python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas 'plotly>=4.8'
        python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
        apt-get update -y
        apt-get install -y ruby ruby-dev

diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index d3bae836cc631..d3fcd7ab36228 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -65,7 +65,7 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht
 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"

 RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib
-RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+RUN python3.9 -m pip install numpy 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'

 # Add Python deps for Spark Connect.
 RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
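The dtype mismatch above traces back to a behavior change in PyArrow 13: with pandas 2.x, Arrow-to-pandas conversion preserves the Arrow timestamp unit (e.g. microseconds) instead of always coercing to nanoseconds as 12.x did. A minimal reproduction sketch, independent of Spark and not part of the patch (assumes only `pyarrow` and `pandas>=2.0` are installed):

```python
# Sketch: how the pandas dtype differs across PyArrow versions.
import datetime

import pyarrow as pa

# The Arrow timestamps here carry microsecond resolution.
arr = pa.array([datetime.datetime(2023, 9, 13)], type=pa.timestamp("us"))
series = arr.to_pandas()

print(series.dtype)
# pyarrow==12.0.1 -> datetime64[ns]  (what the pandas-on-Spark tests expect)
# pyarrow>=13.0.0 -> datetime64[us]  (unit preserved, so assert_series_equal fails)
```

If the PyArrow 13 release notes are read correctly, the old behavior can be restored per call via a `coerce_temporal_nanoseconds` option on `to_pandas`; pinning the CI image here (and capping the requirement at `<13.0.0` in the next patch) avoids relying on that.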
From cc6912ec612c30e46e1595860a5519bb1caa221b Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Sun, 17 Mar 2024 15:15:50 -0700
Subject: [PATCH 2/4] [SPARK-47432][PYTHON][CONNECT][DOCS][3.5] Add `pyarrow` upper bound requirement, `<13.0.0`

### What changes were proposed in this pull request?
This PR aims to add a `pyarrow` upper bound requirement, `<13.0.0`, to Apache Spark 3.5.x.

### Why are the changes needed?
PyArrow 13.0.0 has breaking changes, addressed by #42920, which is part of Apache Spark 4.0.0.

### Does this PR introduce _any_ user-facing change?
No, this only clarifies the upper bound.

### How was this patch tested?
Pass the CIs.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #45553 from dongjoon-hyun/SPARK-47432.

Authored-by: Dongjoon Hyun
Signed-off-by: Dongjoon Hyun
---
 dev/requirements.txt                           | 2 +-
 python/docs/source/getting_started/install.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/requirements.txt b/dev/requirements.txt
index 597417aba1f3d..0749af75aa4be 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -3,7 +3,7 @@ py4j

 # PySpark dependencies (optional)
 numpy
-pyarrow
+pyarrow<13.0.0
 pandas
 scipy
 plotly

diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index 6822285e96172..e97632a8b384b 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -157,7 +157,7 @@ Package                    Supported version         Note
 ========================== ========================= ======================================================================================
 `py4j`                     >=0.10.9.7                Required
 `pandas`                   >=1.0.5                   Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
-`pyarrow`                  >=4.0.0                   Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
+`pyarrow`                  >=4.0.0,<13.0.0           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
 `numpy`                    >=1.15                    Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio`                   >=1.48,<1.57              Required for Spark Connect
 `grpcio-status`            >=1.48,<1.57              Required for Spark Connect

From bb7a6138b827975fc827813ab42a2b9074bf8d5e Mon Sep 17 00:00:00 2001
From: Huw Campbell
Date: Mon, 18 Mar 2024 07:38:10 -0700
Subject: [PATCH 3/4] [SPARK-47434][WEBUI] Fix `statistics` link in `StreamingQueryPage`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?
Like SPARK-24553, this PR aims to fix redirect issues (an incorrect 302) when proxy settings are in use. It changes the generated link to be consistent with the other links by including a trailing slash.

### Why are the changes needed?
When using a proxy, an invalid redirect is issued if the trailing slash is not included.

### Does this PR introduce _any_ user-facing change?
Only that people will be able to use these links when they are behind a proxy.

### How was this patch tested?
With a proxy installed, I went to the location this link would generate: with the trailing slash the page loads, while the link as it exists today redirects incorrectly. Edit: further tested by building a version of our application with this patch applied; the links work now.

### Was this patch authored or co-authored using generative AI tooling?
No.

[Screenshot: page with the working link]

[Screenshot: the statistics page it correctly navigates to]

Before this change, the link would redirect and return a 404. [Screenshot: 404 error page]

Closes #45527 from HuwCampbell/patch-1.

Authored-by: Huw Campbell
Signed-off-by: Dongjoon Hyun
(cherry picked from commit 9b466d329c3c75e89b80109755a41c2d271b8acc)
Signed-off-by: Dongjoon Hyun
---
 .../org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
index 7cd7db4088ac9..ce3e7cde01b7a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
@@ -174,7 +174,7 @@ private[ui] class StreamingQueryPagedTable(

   override def row(query: StructuredStreamingRow): Seq[Node] = {
     val streamingQuery = query.streamingUIData
-    val statisticsLink = "%s/%s/statistics?id=%s"
+    val statisticsLink = "%s/%s/statistics/?id=%s"
       .format(SparkUIUtils.prependBaseUri(request, parent.basePath), parent.prefix,
         streamingQuery.summary.runId)
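To make the one-character fix concrete: the sketch below is not the patched Scala but a Python stand-in with hypothetical placeholder values for the base URI, tab prefix, and run id. Without the trailing slash, the UI server answers the request with a 302 to the slash-terminated path, and behind a path-rewriting proxy that redirect can land outside the proxied prefix.

```python
# Illustration only: the real code is Scala (see the diff above).
# base_uri, prefix, and run_id are made-up placeholders.
base_uri = "/proxy/app-20240318123456-0001"  # assumed prependBaseUri(...) result
prefix = "StreamingQuery"                    # assumed streaming UI tab prefix
run_id = "00000000-0000-0000-0000-000000000000"

old_link = "%s/%s/statistics?id=%s" % (base_uri, prefix, run_id)   # 302s first
new_link = "%s/%s/statistics/?id=%s" % (base_uri, prefix, run_id)  # served directly

print(old_link)  # .../StreamingQuery/statistics?id=...
print(new_link)  # .../StreamingQuery/statistics/?id=...
```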
From 98754657e7855b86845dfc1950220e2ee6777030 Mon Sep 17 00:00:00 2001
From: Gang Wu
Date: Sun, 17 Mar 2024 11:38:54 +0800
Subject: [PATCH 4/4] Test Apache ORC 1.9.3-SNAPSHOT

---
 dev/deps/spark-deps-hadoop-3-hive-2.3 |  6 +++---
 pom.xml                               | 13 ++++++++++++-
 project/SparkBuild.scala              |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index 8ecf931bf513a..7c0b69689966e 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -212,9 +212,9 @@ opencsv/2.3//opencsv-2.3.jar
 opentracing-api/0.33.0//opentracing-api-0.33.0.jar
 opentracing-noop/0.33.0//opentracing-noop-0.33.0.jar
 opentracing-util/0.33.0//opentracing-util-0.33.0.jar
-orc-core/1.9.2/shaded-protobuf/orc-core-1.9.2-shaded-protobuf.jar
-orc-mapreduce/1.9.2/shaded-protobuf/orc-mapreduce-1.9.2-shaded-protobuf.jar
-orc-shims/1.9.2//orc-shims-1.9.2.jar
+orc-core/1.9.3-SNAPSHOT/shaded-protobuf/orc-core-1.9.3-SNAPSHOT-shaded-protobuf.jar
+orc-mapreduce/1.9.3-SNAPSHOT/shaded-protobuf/orc-mapreduce-1.9.3-SNAPSHOT-shaded-protobuf.jar
+orc-shims/1.9.3-SNAPSHOT//orc-shims-1.9.3-SNAPSHOT.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar

diff --git a/pom.xml b/pom.xml
index fb6208777d3ff..0096bb00181af 100644
--- a/pom.xml
+++ b/pom.xml
@@ -141,7 +141,7 @@
     <derby.version>10.14.2.0</derby.version>
    <parquet.version>1.13.1</parquet.version>
-    <orc.version>1.9.2</orc.version>
+    <orc.version>1.9.3-SNAPSHOT</orc.version>
     <orc.classifier>shaded-protobuf</orc.classifier>
     <jetty.version>9.4.54.v20240208</jetty.version>
     <jakartaservlet.version>4.0.3</jakartaservlet.version>
@@ -346,6 +346,17 @@
       <snapshots>
         <enabled>false</enabled>
       </snapshots>
     </repository>
+    <repository>
+      <id>apache.snapshots</id>
+      <name>Apache Development Snapshot Repository</name>
+      <url>https://repository.apache.org/content/repositories/snapshots/</url>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
+      <snapshots>
+        <enabled>true</enabled>
+      </snapshots>
+    </repository>
   </repositories>

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 79b58deafde57..7a51e216e0d5a 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -310,6 +310,7 @@ object SparkBuild extends PomBuild {
       "gcs-maven-central-mirror" at "https://maven-central.storage-download.googleapis.com/maven2/",
       DefaultMavenRepository,
       Resolver.mavenLocal,
+      "Apache Snapshot Repository" at "https://repository.apache.org/snapshots",
       Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns)
     ),
     externalResolvers := resolvers.value,
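When testing a snapshot bump like this one, a quick sanity check is that the pinned dependency manifest stayed in sync with the new version. A small sketch, not part of the patch, run from the repository root against the manifest file named in the diff above:

```python
# Hypothetical helper: confirm every ORC jar in the dependency manifest
# was bumped to the snapshot version.
expected = "1.9.3-SNAPSHOT"
manifest = "dev/deps/spark-deps-hadoop-3-hive-2.3"

with open(manifest) as f:
    orc_jars = [line.strip() for line in f if line.startswith("orc-")]

stale = [jar for jar in orc_jars if expected not in jar]
print("ORC entries:", orc_jars)
assert not stale, f"entries still on an old version: {stale}"
```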