From 8049a203b8c5f2f8045701916e66cfc786e16b57 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 13 Sep 2023 15:51:27 +0800
Subject: [PATCH 1/4] [SPARK-45141][PYTHON][INFRA][TESTS] Pin `pyarrow==12.0.1` in CI

### What changes were proposed in this pull request?
Pin `pyarrow==12.0.1` in CI.

### Why are the changes needed?
To fix the test failure seen in https://github.com/apache/spark/actions/runs/6167186123/job/16738683632 (a minimal reproduction sketch follows this patch's diff):

```
======================================================================
FAIL [0.095s]: test_from_to_pandas (pyspark.pandas.tests.data_type_ops.test_datetime_ops.DatetimeOpsTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/__w/spark/spark/python/pyspark/testing/pandasutils.py", line 122, in _assert_pandas_equal
    assert_series_equal(
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 931, in assert_series_equal
    assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 415, in assert_attr_equal
    raise_assert_detail(obj, msg, left_attr, right_attr)
  File "/usr/local/lib/python3.9/dist-packages/pandas/_testing/asserters.py", line 599, in raise_assert_detail
    raise AssertionError(msg)
AssertionError: Attributes of Series are different

Attribute "dtype" are different
[left]:  datetime64[ns]
[right]: datetime64[us]
```

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI and manual tests

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #42897 from zhengruifeng/pin_pyarrow.

Authored-by: Ruifeng Zheng
Signed-off-by: Ruifeng Zheng
(cherry picked from commit e3d2dfa8b514f9358823c3cb1ad6523da8a6646b)
Signed-off-by: Dongjoon Hyun
---
 .github/workflows/build_and_test.yml | 4 ++--
 dev/infra/Dockerfile                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0760a955342f..8488540b415d5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -258,7 +258,7 @@ jobs:
     - name: Install Python packages (Python 3.8)
       if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       run: |
-        python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
+        python3.8 -m pip install 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
         python3.8 -m pip list
     # Run the tests.
     - name: Run tests
@@ -684,7 +684,7 @@ jobs:
         # See also https://issues.apache.org/jira/browse/SPARK-38279.
        python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme 'sphinx-copybutton==0.5.2' nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' 'nest-asyncio==1.5.8' 'rpds-py==0.16.2' 'alabaster==0.7.13'
        python3.9 -m pip install ipython_genutils # See SPARK-38517
-       python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
+       python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas 'plotly>=4.8'
        python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
        apt-get update -y
        apt-get install -y ruby ruby-dev

diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index d3bae836cc631..d3fcd7ab36228 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -65,7 +65,7 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht
 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"

 RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib
-RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+RUN python3.9 -m pip install numpy 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'

 # Add Python deps for Spark Connect.
 RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
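The dtype mismatch above traces back to a behavior change in PyArrow 13: with pandas 2.x, Arrow-to-pandas conversion preserves the Arrow timestamp unit (e.g. microseconds) instead of always coercing to nanoseconds as 12.x did. A minimal reproduction sketch, independent of Spark and not part of the patch (assumes only `pyarrow` and `pandas>=2.0` are installed):

```python
# Sketch: how the pandas dtype differs across PyArrow versions.
import datetime

import pyarrow as pa

# The Arrow timestamps here carry microsecond resolution.
arr = pa.array([datetime.datetime(2023, 9, 13)], type=pa.timestamp("us"))
series = arr.to_pandas()

print(series.dtype)
# pyarrow==12.0.1 -> datetime64[ns]  (what the pandas-on-Spark tests expect)
# pyarrow>=13.0.0 -> datetime64[us]  (unit preserved, so assert_series_equal fails)
```

If the PyArrow 13 release notes are read correctly, the old behavior can be restored per call via a `coerce_temporal_nanoseconds` option on `to_pandas`; pinning the CI image here (and capping the requirement at `<13.0.0` in the next patch) avoids relying on that.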
From cc6912ec612c30e46e1595860a5519bb1caa221b Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Sun, 17 Mar 2024 15:15:50 -0700
Subject: [PATCH 2/4] [SPARK-47432][PYTHON][CONNECT][DOCS][3.5] Add `pyarrow` upper bound requirement, `<13.0.0`

### What changes were proposed in this pull request?
This PR aims to add a `pyarrow` upper bound requirement, `<13.0.0`, to Apache Spark 3.5.x.

### Why are the changes needed?
PyArrow 13.0.0 has breaking changes, addressed by #42920, which is part of Apache Spark 4.0.0.

### Does this PR introduce _any_ user-facing change?
No, this only clarifies the upper bound.

### How was this patch tested?
Pass the CIs.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #45553 from dongjoon-hyun/SPARK-47432.

Authored-by: Dongjoon Hyun
Signed-off-by: Dongjoon Hyun
---
 dev/requirements.txt                           | 2 +-
 python/docs/source/getting_started/install.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/requirements.txt b/dev/requirements.txt
index 597417aba1f3d..0749af75aa4be 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -3,7 +3,7 @@ py4j

 # PySpark dependencies (optional)
 numpy
-pyarrow
+pyarrow<13.0.0
 pandas
 scipy
 plotly

diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index 6822285e96172..e97632a8b384b 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -157,7 +157,7 @@ Package                    Supported version         Note
 ========================== ========================= ======================================================================================
 `py4j`                     >=0.10.9.7                Required
 `pandas`                   >=1.0.5                   Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
-`pyarrow`                  >=4.0.0                   Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
+`pyarrow`                  >=4.0.0,<13.0.0           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
 `numpy`                    >=1.15                    Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio`                   >=1.48,<1.57              Required for Spark Connect
 `grpcio-status`            >=1.48,<1.57              Required for Spark Connect

From bb7a6138b827975fc827813ab42a2b9074bf8d5e Mon Sep 17 00:00:00 2001
From: Huw Campbell
Date: Mon, 18 Mar 2024 07:38:10 -0700
Subject: [PATCH 3/4] [SPARK-47434][WEBUI] Fix `statistics` link in `StreamingQueryPage`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?
Like SPARK-24553, this PR aims to fix redirect issues (an incorrect 302) when proxy settings are in use. It changes the generated link to be consistent with the other links by including a trailing slash.

### Why are the changes needed?
When using a proxy, an invalid redirect is issued if the trailing slash is not included.

### Does this PR introduce _any_ user-facing change?
Only that people will be able to use these links when they are behind a proxy.

### How was this patch tested?
With a proxy installed, I went to the location this link would generate: with the trailing slash the page loads, while the link as it exists today redirects incorrectly. Edit: further tested by building a version of our application with this patch applied; the links work now.

### Was this patch authored or co-authored using generative AI tooling?
No.

[Screenshot: page with the working link]

[Screenshot: the statistics page it correctly navigates to]

Before this change, the link would redirect and return a 404. [Screenshot: 404 error page]

Closes #45527 from HuwCampbell/patch-1.

Authored-by: Huw Campbell
Signed-off-by: Dongjoon Hyun
(cherry picked from commit 9b466d329c3c75e89b80109755a41c2d271b8acc)
Signed-off-by: Dongjoon Hyun
---
 .../org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
index 7cd7db4088ac9..ce3e7cde01b7a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala
@@ -174,7 +174,7 @@ private[ui] class StreamingQueryPagedTable(

   override def row(query: StructuredStreamingRow): Seq[Node] = {
     val streamingQuery = query.streamingUIData
-    val statisticsLink = "%s/%s/statistics?id=%s"
+    val statisticsLink = "%s/%s/statistics/?id=%s"
       .format(SparkUIUtils.prependBaseUri(request, parent.basePath), parent.prefix,
         streamingQuery.summary.runId)
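To make the one-character fix concrete: the sketch below is not the patched Scala but a Python stand-in with hypothetical placeholder values for the base URI, tab prefix, and run id. Without the trailing slash, the UI server answers the request with a 302 to the slash-terminated path, and behind a path-rewriting proxy that redirect can land outside the proxied prefix.

```python
# Illustration only: the real code is Scala (see the diff above).
# base_uri, prefix, and run_id are made-up placeholders.
base_uri = "/proxy/app-20240318123456-0001"  # assumed prependBaseUri(...) result
prefix = "StreamingQuery"                    # assumed streaming UI tab prefix
run_id = "00000000-0000-0000-0000-000000000000"

old_link = "%s/%s/statistics?id=%s" % (base_uri, prefix, run_id)   # 302s first
new_link = "%s/%s/statistics/?id=%s" % (base_uri, prefix, run_id)  # served directly

print(old_link)  # .../StreamingQuery/statistics?id=...
print(new_link)  # .../StreamingQuery/statistics/?id=...
```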
From 98754657e7855b86845dfc1950220e2ee6777030 Mon Sep 17 00:00:00 2001
From: Gang Wu
Date: Sun, 17 Mar 2024 11:38:54 +0800
Subject: [PATCH 4/4] Test Apache ORC 1.9.3-SNAPSHOT

---
 dev/deps/spark-deps-hadoop-3-hive-2.3 |  6 +++---
 pom.xml                               | 13 ++++++++++++-
 project/SparkBuild.scala              |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index 8ecf931bf513a..7c0b69689966e 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -212,9 +212,9 @@ opencsv/2.3//opencsv-2.3.jar
 opentracing-api/0.33.0//opentracing-api-0.33.0.jar
 opentracing-noop/0.33.0//opentracing-noop-0.33.0.jar
 opentracing-util/0.33.0//opentracing-util-0.33.0.jar
-orc-core/1.9.2/shaded-protobuf/orc-core-1.9.2-shaded-protobuf.jar
-orc-mapreduce/1.9.2/shaded-protobuf/orc-mapreduce-1.9.2-shaded-protobuf.jar
-orc-shims/1.9.2//orc-shims-1.9.2.jar
+orc-core/1.9.3-SNAPSHOT/shaded-protobuf/orc-core-1.9.3-SNAPSHOT-shaded-protobuf.jar
+orc-mapreduce/1.9.3-SNAPSHOT/shaded-protobuf/orc-mapreduce-1.9.3-SNAPSHOT-shaded-protobuf.jar
+orc-shims/1.9.3-SNAPSHOT//orc-shims-1.9.3-SNAPSHOT.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar

diff --git a/pom.xml b/pom.xml
index fb6208777d3ff..0096bb00181af 100644
--- a/pom.xml
+++ b/pom.xml
@@ -141,7 +141,7 @@
     <derby.version>10.14.2.0</derby.version>
    <parquet.version>1.13.1</parquet.version>
-    <orc.version>1.9.2</orc.version>
+    <orc.version>1.9.3-SNAPSHOT</orc.version>
     <orc.classifier>shaded-protobuf</orc.classifier>
     <jetty.version>9.4.54.v20240208</jetty.version>
     <jakartaservlet.version>4.0.3</jakartaservlet.version>
@@ -346,6 +346,17 @@
       <snapshots>
         <enabled>false</enabled>
       </snapshots>
     </repository>
+    <repository>
+      <id>apache.snapshots</id>
+      <name>Apache Development Snapshot Repository</name>
+      <url>https://repository.apache.org/content/repositories/snapshots/</url>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
+      <snapshots>
+        <enabled>true</enabled>
+      </snapshots>
+    </repository>
   </repositories>

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 79b58deafde57..7a51e216e0d5a 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -310,6 +310,7 @@ object SparkBuild extends PomBuild {
       "gcs-maven-central-mirror" at "https://maven-central.storage-download.googleapis.com/maven2/",
       DefaultMavenRepository,
       Resolver.mavenLocal,
+      "Apache Snapshot Repository" at "https://repository.apache.org/snapshots",
       Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns)
     ),
     externalResolvers := resolvers.value,
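When testing a snapshot bump like this one, a quick sanity check is that the pinned dependency manifest stayed in sync with the new version. A small sketch, not part of the patch, run from the repository root against the manifest file named in the diff above:

```python
# Hypothetical helper: confirm every ORC jar in the dependency manifest
# was bumped to the snapshot version.
expected = "1.9.3-SNAPSHOT"
manifest = "dev/deps/spark-deps-hadoop-3-hive-2.3"

with open(manifest) as f:
    orc_jars = [line.strip() for line in f if line.startswith("orc-")]

stale = [jar for jar in orc_jars if expected not in jar]
print("ORC entries:", orc_jars)
assert not stale, f"entries still on an old version: {stale}"
```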