Merged
Changes from all commits (35 commits)
79f77f7
[SPARK-46626][DOCS] Bump jekyll to 4.3.3 to enable support for Ruby 3…
nchammas Jan 9, 2024
a4c9507
[MINOR][CONNECT] Fix typo in error handling opType name
grundprinzip Jan 9, 2024
d008f81
[SPARK-46628][INFRA] Use SPDX short identifier in `license` name
dongjoon-hyun Jan 9, 2024
e7536f2
[SPARK-46610][SQL] Create table should throw exception when no value …
amaliujia Jan 9, 2024
7f056d8
[SPARK-46382][SQL] XML: Default ignoreSurroundingSpaces to true
shujingyang-db Jan 9, 2024
03fc5e2
[SPARK-46600][SQL] Move shared code between SqlConf and SqlApiConf to…
amaliujia Jan 9, 2024
f4c8dd6
Revert "[SPARK-46474][INFRA] Upgrade upload-artifact action to v4"
HyukjinKwon Jan 9, 2024
ee2a87b
[SPARK-40876][SQL][TESTS][FOLLOW-UP] Remove invalid decimal test case…
HyukjinKwon Jan 9, 2024
8fa794b
[SPARK-46627][SS][UI] Fix timeline tooltip content on streaming ui
yaooqinn Jan 9, 2024
ee1fd92
[SPARK-46331][SQL] Removing CodegenFallback from subset of DateTime e…
dbatomic Jan 9, 2024
ef7846a
[SPARK-46622][CORE] Override `toString` method for `o.a.s.network.shu…
LuciferYang Jan 9, 2024
56dd1f7
[SPARK-46634][SQL] literal validation should not drill down to null f…
cloud-fan Jan 9, 2024
f765022
[SPARK-46630][SQL] XML: Validate XML element name on write
sandip-db Jan 10, 2024
71468eb
[SPARK-46637][DOCS] Enhancing the Visual Appeal of Spark doc website
gengliangwang Jan 10, 2024
2cf07f9
[SPARK-46593][PS][TESTS] Refactor `data_type_ops` tests again
zhengruifeng Jan 10, 2024
d203328
[MINOR][PYTHON][TESTS] Retry `test_map_in_pandas_with_column_vector`
zhengruifeng Jan 10, 2024
6679419
[SPARK-46437][DOCS] Add custom tags for conditional Jekyll includes
nchammas Jan 10, 2024
3a6b9ad
[SPARK-46633][SQL] Fix Avro reader to handle zero-length blocks
sadikovi Jan 10, 2024
eb11190
[MINOR][INFRA] Ensure that docs build successfully with SKIP_API=1
nchammas Jan 10, 2024
0a66be8
[SPARK-37039][PS] Fix `Series.astype` to work properly with missing v…
itholic Jan 10, 2024
d613772
[MINOR][DOCS] Add license header at docs/_plugins
HyukjinKwon Jan 10, 2024
48d22e9
[SPARK-46643][SQL][TESTS] Fix ORC tests to be independent from defaul…
dongjoon-hyun Jan 10, 2024
686f428
[SPARK-46541][SQL][CONNECT] Fix the ambiguous column reference in sel…
zhengruifeng Jan 10, 2024
24cb611
[SPARK-46646][SQL][TESTS] Improve `TPCDSQueryBenchmark` to support ot…
dongjoon-hyun Jan 10, 2024
0791e9f
[SPARK-46536][SQL] Support GROUP BY calendar_interval_type
stefankandic Jan 10, 2024
4957c1a
[SPARK-46645][INFRA] Exclude unittest-xml-reporting in Python 3.12 image
HyukjinKwon Jan 10, 2024
f526bea
[SPARK-46649][PYTHON][INFRA] Run PyPy 3 and Python 3.10 tests indepen…
HyukjinKwon Jan 10, 2024
d2f5724
[MINOR][DOCS] Correct the usage example of Dataset in Java
Jan 10, 2024
85b504d
[SPARK-46442][SQL] DS V2 supports push down PERCENTILE_CONT and PERCE…
beliefer Jan 10, 2024
a3991b1
[SPARK-46648][SQL] Use `zstd` as the default ORC compression
dongjoon-hyun Jan 10, 2024
6375817
[SPARK-46651][PS][TESTS] Split `FrameTakeTests`
zhengruifeng Jan 10, 2024
fcdfc8c
[SPARK-46437][FOLLOWUP] Update configuration.md to use include_api_gen
nchammas Jan 10, 2024
bda9957
[SPARK-46635][PYTHON][DOCS] Refine docstring of `from_csv/schema_of_c…
LuciferYang Jan 10, 2024
a73ff66
[SPARK-46652][SQL][TESTS] Remove `Snappy` from `TPCDSQueryBenchmark` …
dongjoon-hyun Jan 10, 2024
f7b0b45
[SPARK-46547][SS] Swallow non-fatal exception in maintenance task to …
anishshri-db Jan 10, 2024
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -185,7 +185,7 @@ jobs:
echo "Preparing the benchmark results:"
tar -cvf benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard`
- name: Upload benchmark results
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }}
path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar
25 changes: 14 additions & 11 deletions .github/workflows/build_and_test.yml
@@ -272,13 +272,13 @@ jobs:
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
@@ -468,13 +468,13 @@ jobs:
name: PySpark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"
@@ -553,7 +553,7 @@ jobs:
./dev/run-tests --parallelism 1 --modules sparkr
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
@@ -761,6 +761,9 @@ jobs:
run: ./dev/lint-r
- name: Run documentation build
run: |
# Build docs first with SKIP_API to ensure they are buildable without requiring any
# language docs to be built beforehand.
cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
@@ -774,7 +777,7 @@
run: tar cjf site.tar.bz2 docs/_site
- name: Upload documentation
if: github.repository != 'apache/spark'
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: site
path: site.tar.bz2
@@ -927,13 +930,13 @@ jobs:
spark.sql.join.forceApplyShuffledHashJoin=true
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"
@@ -996,13 +999,13 @@ jobs:
./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"
@@ -1077,7 +1080,7 @@ jobs:
build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
- name: Upload Spark on K8S integration tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: spark-on-kubernetes-it-log
path: "**/target/integration-tests.log"
2 changes: 1 addition & 1 deletion .github/workflows/build_python.yml
@@ -27,7 +27,7 @@ jobs:
run-build:
strategy:
matrix:
pyversion: ["pypy3,python3.10", "python3.11", "python3.12"]
pyversion: ["pypy3", "python3.10", "python3.11", "python3.12"]
permissions:
packages: write
name: Run
4 changes: 2 additions & 2 deletions .github/workflows/maven_test.yml
@@ -200,13 +200,13 @@ jobs:
rm -rf ~/.m2/repository/org/apache/spark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
@@ -54,4 +54,9 @@ public int hashCode() {
result = 31 * result + minor;
return result;
}

@Override
public String toString() {
return "StoreVersion[" + major + "." + minor + ']';
}
}
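For reference, a minimal hypothetical sketch of what the override above buys (the wrapper class and main method are illustrative only; StoreVersion and its (major, minor) constructor come from the diff):

import org.apache.spark.network.shuffledb.StoreVersion;

public class StoreVersionToStringExample {
  public static void main(String[] args) {
    StoreVersion v = new StoreVersion(2, 0);
    // With the override above this prints "StoreVersion[2.0]" rather than the default
    // Object identity string, which is what DBProviderSuite below expects to see in the
    // "incompatible with current version" error message.
    System.out.println(v);
  }
}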
@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.network.util;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.SystemUtils;
import org.apache.spark.network.shuffledb.DBBackend;
import org.apache.spark.network.shuffledb.StoreVersion;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.io.IOException;

import static org.junit.jupiter.api.Assumptions.assumeFalse;

public class DBProviderSuite {

@Test
public void testRockDBCheckVersionFailed() throws IOException {
testCheckVersionFailed(DBBackend.ROCKSDB, "rocksdb");
}

@Test
public void testLevelDBCheckVersionFailed() throws IOException {
assumeFalse(SystemUtils.IS_OS_MAC_OSX && SystemUtils.OS_ARCH.equals("aarch64"));
testCheckVersionFailed(DBBackend.LEVELDB, "leveldb");
}

private void testCheckVersionFailed(DBBackend dbBackend, String namePrefix) throws IOException {
String root = System.getProperty("java.io.tmpdir");
File dbFile = JavaUtils.createDirectory(root, namePrefix);
try {
StoreVersion v1 = new StoreVersion(1, 0);
ObjectMapper mapper = new ObjectMapper();
DBProvider.initDB(dbBackend, dbFile, v1, mapper).close();
StoreVersion v2 = new StoreVersion(2, 0);
IOException ioe = Assertions.assertThrows(IOException.class, () ->
DBProvider.initDB(dbBackend, dbFile, v2, mapper));
Assertions.assertTrue(
ioe.getMessage().contains("incompatible with current version StoreVersion[2.0]"));
} finally {
JavaUtils.deleteRecursively(dbFile);
}
}
}
@@ -44,7 +44,7 @@
* @since 3.0.0
*/
@Unstable
public final class CalendarInterval implements Serializable {
public final class CalendarInterval implements Serializable, Comparable<CalendarInterval> {
// NOTE: If you're moving or renaming this file, you should also update Unidoc configuration
// specified in 'SparkBuild.scala'.
public final int months;
@@ -127,4 +127,26 @@ private void appendUnit(StringBuilder sb, long value, String unit) {
* @throws ArithmeticException if a numeric overflow occurs
*/
public Duration extractAsDuration() { return Duration.of(microseconds, ChronoUnit.MICROS); }

/**
* This method is not used to order CalendarInterval instances, as they are not orderable and
* cannot be used in an ORDER BY statement.
* Instead, it is used to find identical interval instances for aggregation purposes.
* It compares the 'months', 'days', and 'microseconds' fields of this CalendarInterval
* with another instance. The comparison is done first on the 'months', then on the 'days',
* and finally on the 'microseconds'.
*
* @param o The CalendarInterval instance to compare with.
* @return Zero if this object is equal to the specified object, and non-zero otherwise
*/
@Override
public int compareTo(CalendarInterval o) {
if (this.months != o.months) {
return Integer.compare(this.months, o.months);
} else if (this.days != o.days) {
return Integer.compare(this.days, o.days);
} else {
return Long.compare(this.microseconds, o.microseconds);
}
}
}
@@ -76,6 +76,22 @@ public void toStringTest() {
i.toString());
}

@Test
public void compareToTest() {
CalendarInterval i = new CalendarInterval(0, 0, 0);

assertEquals(i.compareTo(new CalendarInterval(0, 0, 0)), 0);
assertEquals(i.compareTo(new CalendarInterval(0, 0, 1)), -1);
assertEquals(i.compareTo(new CalendarInterval(0, 1, 0)), -1);
assertEquals(i.compareTo(new CalendarInterval(0, 1, -1)), -1);
assertEquals(i.compareTo(new CalendarInterval(1, 0, 0)), -1);
assertEquals(i.compareTo(new CalendarInterval(1, 0, -1)), -1);
assertEquals(i.compareTo(new CalendarInterval(0, 0, -1)), 1);
assertEquals(i.compareTo(new CalendarInterval(0, -1, 0)), 1);
assertEquals(i.compareTo(new CalendarInterval(-1, 0, 0)), 1);
assertEquals(i.compareTo(new CalendarInterval(-1, 0, 1)), 1);
}

@Test
public void periodAndDurationTest() {
CalendarInterval interval = new CalendarInterval(120, -40, 123456);
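As the Javadoc on compareTo notes, the ordering exists so identical intervals can be collapsed for aggregation (SPARK-46536), not for ORDER BY. A small illustrative sketch, assuming only the public (months, days, microseconds) constructor used in the tests above:

import java.util.TreeMap;
import org.apache.spark.unsafe.types.CalendarInterval;

public class CalendarIntervalGroupingExample {
  public static void main(String[] args) {
    // Comparable lets intervals serve as keys in an ordered container, so equal
    // (months, days, microseconds) triples collapse into a single group.
    TreeMap<CalendarInterval, Integer> counts = new TreeMap<>();
    CalendarInterval[] values = {
      new CalendarInterval(1, 2, 3L),
      new CalendarInterval(1, 2, 3L),
      new CalendarInterval(0, 0, 1L)
    };
    for (CalendarInterval v : values) {
      counts.merge(v, 1, Integer::sum);
    }
    System.out.println(counts.size()); // 2
  }
}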
11 changes: 6 additions & 5 deletions common/utils/src/main/resources/error/error-classes.json
@@ -324,6 +324,12 @@
],
"sqlState" : "0AKD0"
},
"CANNOT_RESOLVE_DATAFRAME_COLUMN" : {
"message" : [
"Cannot resolve dataframe column <name>. It's probably because of illegal references like `df1.select(df2.col(\"a\"))`."
],
"sqlState" : "42704"
},
"CANNOT_RESOLVE_STAR_EXPAND" : {
"message" : [
"Cannot resolve <targetString>.* given input columns <columns>. Please check that the specified table or struct exists and is accessible in the input columns."
@@ -6843,11 +6849,6 @@
"Cannot modify the value of a static config: <k>"
]
},
"_LEGACY_ERROR_TEMP_3051" : {
"message" : [
"When resolving <u>, fail to find subplan with plan_id=<planId> in <q>"
]
},
"_LEGACY_ERROR_TEMP_3052" : {
"message" : [
"Unexpected resolved action: <other>"
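For context on the new CANNOT_RESOLVE_DATAFRAME_COLUMN class, a hypothetical Java Dataset snippet of the illegal cross-DataFrame reference its message describes (session setup and column names are illustrative, and the exact error surfaced can depend on whether the query runs over Spark Connect):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CrossDataFrameReferenceExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();
    Dataset<Row> df1 = spark.range(3).toDF("a");
    Dataset<Row> df2 = spark.range(3).toDF("a");
    // df2.col("a") belongs to df2's plan, not df1's, so the analyzer cannot resolve it
    // against df1 and is expected to fail, now reported via CANNOT_RESOLVE_DATAFRAME_COLUMN
    // rather than the removed _LEGACY_ERROR_TEMP_3051 template.
    df1.select(df2.col("a")).show();
    spark.stop();
  }
}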
@@ -182,16 +182,19 @@ private[sql] object AvroUtils extends Logging {

def hasNextRow: Boolean = {
while (!completed && currentRow.isEmpty) {
val r = fileReader.hasNext && !fileReader.pastSync(stopPosition)
if (!r) {
if (fileReader.pastSync(stopPosition)) {
fileReader.close()
completed = true
currentRow = None
} else {
} else if (fileReader.hasNext()) {
val record = fileReader.next()
// the row must be deserialized in hasNextRow, because AvroDeserializer#deserialize
// potentially filters rows
currentRow = deserializer.deserialize(record).asInstanceOf[Option[InternalRow]]
} else {
// In this case, `fileReader.hasNext()` returns false but we are not past sync point yet.
// This means we hit empty blocks; continue reading the file in case there are
// non-empty blocks ahead or until we move past the sync point.
}
}
currentRow.isDefined
5 changes: 5 additions & 0 deletions connector/avro/src/test/resources/empty_blocks.avro

Large diffs are not rendered by default.

@@ -2716,6 +2716,19 @@ abstract class AvroSuite
assert(AvroOptions.isValidOption("datetimeRebaseMode"))
assert(AvroOptions.isValidOption("enableStableIdentifiersForUnionType"))
}

test("SPARK-46633: read file with empty blocks") {
for (maxPartitionBytes <- Seq(100, 100000, 100000000)) {
withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> s"$maxPartitionBytes") {
val file = getResourceAvroFilePath("empty_blocks.avro")
val df = spark.read.format("avro").load(file)
val count = df.count()
val records = df.collect()
assert(count == 58)
assert(count == records.length)
}
}
}
}

class AvroV1Suite extends AvroSuite {
@@ -894,7 +894,7 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM
// df1("i") is not ambiguous, but it's not valid in the projected df.
df1.select((df1("i") + 1).as("plus")).select(df1("i")).collect()
}
assert(e1.getMessage.contains("MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT"))
assert(e1.getMessage.contains("UNRESOLVED_COLUMN.WITH_SUGGESTION"))

checkSameResult(
Seq(Row(1, "a")),
@@ -191,7 +191,7 @@ class SparkConnectService(debug: Boolean) extends AsyncService with BindableServ
new SparkConnectReleaseExecuteHandler(responseObserver).handle(request)
} catch
ErrorUtils.handleError(
"reattachExecute",
"releaseExecute",
observer = responseObserver,
userId = request.getUserContext.getUserId,
sessionId = request.getSessionId)
@@ -223,7 +223,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) {
.attr("cx", function(d) { return x(d.x); })
.attr("cy", function(d) { return y(d.y); })
.attr("r", function(d) { return isFailedBatch(d.x) ? "2" : "3";})
.on('mouseover', function(d) {
.on('mouseover', function(event, d) {
var tip = yValueFormat(d.y) + " " + unitY + " at " + timeTipStrings[d.x];
showBootstrapTooltip(d3.select(this), tip);
// show the point