Merged. Changes from all 24 commits:
bd9a4b9  [SPARK-46528][BUILD] Upgrade `zstd-jni` to 1.5.5-11  (dongjoon-hyun, Dec 28, 2023)
d2e87c8  [SPARK-46521][PYTHON][DOCS] Refine docstring of `array_remove/array_d…  (LuciferYang, Dec 28, 2023)
d0245d3  [SPARK-46514][TESTS] Fix HiveMetastoreLazyInitializationSuite  (yaooqinn, Dec 28, 2023)
229a4ea  [SPARK-45917][PYTHON][SQL] Automatic registration of Python Data Sour…  (HyukjinKwon, Dec 28, 2023)
ff881da  [SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize `IndexingTest`: factor…  (zhengruifeng, Dec 28, 2023)
a3e7bad  [SPARK-46529][TESTS] Upgrade guava from 18.0 to 19.0 for `docker-inte…  (yaooqinn, Dec 28, 2023)
93a2526  [SPARK-46366][SQL] Use WITH expression in BETWEEN to avoid duplicate …  (dbatomic, Dec 28, 2023)
bb497eb  [SPARK-46519][SQL] Clear unused error classes from `error-classes.jso…  (panbingkun, Dec 28, 2023)
6fcc268  [SPARK-46532][CONNECT] Pass message parameters in metadata of `ErrorI…  (MaxGekk, Dec 28, 2023)
0de70c4  [SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize `IndexingTest`: Move t…  (zhengruifeng, Dec 28, 2023)
5db6824  [SPARK-46537][SQL] Convert NPE and asserts from commands to internal …  (MaxGekk, Dec 28, 2023)
af8228c  [SPARK-46535][SQL] Fix NPE when describe extended a column without co…  (Zouxxyy, Dec 28, 2023)
4ec63be  [SPARK-46382][SQL] XML: Capture values interspersed between elements  (shujingyang-db, Dec 29, 2023)
826f8d9  [SPARK-46484][SQL][CONNECT] Make `resolveOperators*` helper functions…  (zhengruifeng, Dec 29, 2023)
b249cb8  [SPARK-46538][ML] Fix the ambiguous column reference issue in `ALSMod…  (zhengruifeng, Dec 29, 2023)
c7c43cf  [SPARK-46533][PYTHON][DOCS] Refine docstring of `array_min/array_max/…  (LuciferYang, Dec 29, 2023)
e9c4e7e  [SPARK-46397][PYTHON][CONNECT] Function `sha2` should raise `PySparkV…  (zhengruifeng, Dec 29, 2023)
f99b86a  [SPARK-45914][PYTHON] Support commit and abort API for Python data so…  (allisonwang-db, Dec 29, 2023)
2e918e8  [SPARK-46532][CONNECT][PYTHON][FOLLOW-UP] Pass message parameters in …  (HyukjinKwon, Dec 29, 2023)
d6334a3  [SPARK-46530][PYTHON][SQL] Check Python executable when looking up av…  (HyukjinKwon, Dec 30, 2023)
9671abb  [SPARK-46509][CORE][SS] Replace `.reverse.find` with `.findLast`  (LuciferYang, Dec 30, 2023)
0eeb60b  [SPARK-46542][SQL] Remove the check for `c>=0` from `ExternalCatalogU…  (LuciferYang, Dec 30, 2023)
04b5128  [SPARK-46545][INFRA] Pin `lxml==4.9.4`  (zhengruifeng, Dec 30, 2023)
af3a225  [SPARK-46531][BUILD] Move the dependency management of `datasketches-…  (LuciferYang, Dec 30, 2023)
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -253,7 +253,7 @@ jobs:
     - name: Install Python packages (Python 3.9)
       if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
       run: |
-        python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1'
+        python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1'
         python3.9 -m pip list
     # Run the tests.
     - name: Run tests
32 changes: 0 additions & 32 deletions common/utils/src/main/resources/error/error-classes.json
@@ -875,12 +875,6 @@
     ],
     "sqlState" : "42K01"
   },
-  "DATA_SOURCE_ALREADY_EXISTS" : {
-    "message" : [
-      "Data source '<provider>' already exists in the registry. Please use a different name for the new data source."
-    ],
-    "sqlState" : "42710"
-  },
   "DATA_SOURCE_NOT_EXIST" : {
     "message" : [
       "Data source '<provider>' not found. Please make sure the data source is registered."
@@ -1480,12 +1474,6 @@
     },
     "sqlState" : "42K0B"
   },
-  "INCORRECT_END_OFFSET" : {
-    "message" : [
-      "Max offset with <rowsPerSecond> rowsPerSecond is <maxSeconds>, but it's <endSeconds> now."
-    ],
-    "sqlState" : "22003"
-  },
   "INCORRECT_RAMP_UP_RATE" : {
     "message" : [
       "Max offset with <rowsPerSecond> rowsPerSecond is <maxSeconds>, but 'rampUpTimeSeconds' is <rampUpTimeSeconds>."
@@ -1906,11 +1894,6 @@
         "Operation not found."
       ]
     },
-    "SESSION_ALREADY_EXISTS" : {
-      "message" : [
-        "Session already exists."
-      ]
-    },
     "SESSION_CLOSED" : {
       "message" : [
         "Session was closed."
@@ -6065,11 +6048,6 @@
       "<walkedTypePath>."
     ]
   },
-  "_LEGACY_ERROR_TEMP_2142" : {
-    "message" : [
-      "Attributes for type <schema> is not supported."
-    ]
-  },
   "_LEGACY_ERROR_TEMP_2144" : {
     "message" : [
       "Unable to find constructor for <tpe>. This could happen if <tpe> is an interface, or a trait without companion object constructor."
@@ -6920,11 +6898,6 @@
      "<clazz>: <msg>"
    ]
  },
-  "_LEGACY_ERROR_TEMP_3066" : {
-    "message" : [
-      "<msg>"
-    ]
-  },
   "_LEGACY_ERROR_TEMP_3067" : {
     "message" : [
       "Streaming aggregation doesn't support group aggregate pandas UDF"
@@ -6980,11 +6953,6 @@
      "More than one event time columns are available. Please ensure there is at most one event time column per stream. event time columns: <eventTimeCols>"
    ]
  },
-  "_LEGACY_ERROR_TEMP_3078" : {
-    "message" : [
-      "Can not match ParquetTable in the query."
-    ]
-  },
   "_LEGACY_ERROR_TEMP_3079" : {
     "message" : [
       "Dynamic partition cannot be the parent of a static partition."
@@ -85,7 +85,13 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM
             |""".stripMargin)
           .collect()
       }
-      assert(ex.getErrorClass != null)
+      assert(
+        ex.getErrorClass ===
+          "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER")
+      assert(
+        ex.getMessageParameters.asScala == Map(
+          "datetime" -> "'02-29'",
+          "config" -> "\"spark.sql.legacy.timeParserPolicy\""))
       if (enrichErrorEnabled) {
         assert(ex.getCause.isInstanceOf[DateTimeException])
       } else {
@@ -516,6 +516,10 @@ class PlanGenerationTestSuite
     simple.where("a + id < 1000")
   }

+  test("between expr") {
+    simple.selectExpr("rand(123) BETWEEN 0.1 AND 0.2")
+  }
+
   test("unpivot values") {
     simple.unpivot(
       ids = Array(fn.col("id"), fn.col("a")),
@@ -372,10 +372,14 @@ private[client] object GrpcExceptionConverter {
       .addAllErrorTypeHierarchy(classes.toImmutableArraySeq.asJava)

     if (errorClass != null) {
+      val messageParameters = JsonMethods
+        .parse(info.getMetadataOrDefault("messageParameters", "{}"))
+        .extract[Map[String, String]]
       builder.setSparkThrowable(
         FetchErrorDetailsResponse.SparkThrowable
           .newBuilder()
           .setErrorClass(errorClass)
+          .putAllMessageParameters(messageParameters.asJava)
           .build())
     }

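The client side of SPARK-46532 shown above uses json4s: the server publishes the message parameters as a JSON object string in the `messageParameters` metadata field, and the client parses it back into a `Map[String, String]`. A minimal standalone sketch of that decoding step (the JSON literal here is a made-up example, not taken from the PR):

    import org.json4s.{DefaultFormats, Formats}
    import org.json4s.jackson.JsonMethods

    implicit val formats: Formats = DefaultFormats

    // Decode a metadata value of the shape the server serializes.
    val params: Map[String, String] = JsonMethods
      .parse("""{"datetime": "'02-29'"}""")
      .extract[Map[String, String]]
    // params == Map("datetime" -> "'02-29'")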
@@ -0,0 +1,3 @@
Project [((_common_expr_0#0 >= cast(0.1 as double)) AND (_common_expr_0#0 <= cast(0.2 as double))) AS between(rand(123), 0.1, 0.2)#0]
+- Project [id#0L, a#0, b#0, rand(123) AS _common_expr_0#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0]
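This golden file shows the effect of SPARK-46366: the non-deterministic `rand(123)` is hoisted into `_common_expr_0` via a WITH (common) expression, so BETWEEN evaluates its operand once instead of duplicating it into both comparisons. A minimal sketch to reproduce a plan like the one above locally (assumes a plain local SparkSession; not part of the PR):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    // Prints a plan with a single rand(123), aliased as a common expression.
    spark.range(10).selectExpr("rand(123) BETWEEN 0.1 AND 0.2").explain(true)
    spark.stop()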
@@ -0,0 +1,20 @@
{
"common": {
"planId": "1"
},
"project": {
"input": {
"common": {
"planId": "0"
},
"localRelation": {
"schema": "struct\u003cid:bigint,a:int,b:double\u003e"
}
},
"expressions": [{
"expressionString": {
"expression": "rand(123) BETWEEN 0.1 AND 0.2"
}
}]
}
}
Binary file not shown.
@@ -256,4 +256,13 @@ object Connect {
       .version("4.0.0")
       .booleanConf
       .createWithDefault(true)
+
+  val CONNECT_GRPC_MAX_METADATA_SIZE =
+    buildStaticConf("spark.connect.grpc.maxMetadataSize")
+      .doc(
+        "Sets the maximum size of metadata fields. For instance, it restricts metadata fields " +
+          "in `ErrorInfo`.")
+      .version("4.0.0")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefault(1024)
 }
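`CONNECT_GRPC_MAX_METADATA_SIZE` is a static conf, so it has to be set before the Connect server starts. A small sketch of setting and reading it (the 8192 value is a hypothetical example, not a recommendation from the PR):

    import org.apache.spark.SparkConf

    // Supplied at server startup, e.g. --conf spark.connect.grpc.maxMetadataSize=8192
    val conf = new SparkConf().set("spark.connect.grpc.maxMetadataSize", "8192")
    // Server code then reads it back the way ErrorUtils does below:
    //   SparkEnv.get.conf.get(Connect.CONNECT_GRPC_MAX_METADATA_SIZE)  // => 8192L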
@@ -172,6 +172,7 @@ private[connect] object ErrorUtils extends Logging {
         "classes",
         JsonMethods.compact(JsonMethods.render(allClasses(st.getClass).map(_.getName))))

+    val maxMetadataSize = SparkEnv.get.conf.get(Connect.CONNECT_GRPC_MAX_METADATA_SIZE)
     // Add the SQL State and Error Class to the response metadata of the ErrorInfoObject.
     st match {
       case e: SparkThrowable =>
@@ -181,7 +182,12 @@
         }
         val errorClass = e.getErrorClass
         if (errorClass != null && errorClass.nonEmpty) {
-          errorInfo.putMetadata("errorClass", errorClass)
+          val messageParameters = JsonMethods.compact(
+            JsonMethods.render(map2jvalue(e.getMessageParameters.asScala.toMap)))
+          if (messageParameters.length <= maxMetadataSize) {
+            errorInfo.putMetadata("errorClass", errorClass)
+            errorInfo.putMetadata("messageParameters", messageParameters)
+          }
         }
       case _ =>
     }
@@ -200,8 +206,10 @@
     val withStackTrace =
       if (sessionHolderOpt.exists(
           _.session.conf.get(SQLConf.PYSPARK_JVM_STACKTRACE_ENABLED) && stackTrace.nonEmpty)) {
-        val maxSize = SparkEnv.get.conf.get(Connect.CONNECT_JVM_STACK_TRACE_MAX_SIZE)
-        errorInfo.putMetadata("stackTrace", StringUtils.abbreviate(stackTrace.get, maxSize))
+        val maxSize = Math.min(
+          SparkEnv.get.conf.get(Connect.CONNECT_JVM_STACK_TRACE_MAX_SIZE),
+          maxMetadataSize)
+        errorInfo.putMetadata("stackTrace", StringUtils.abbreviate(stackTrace.get, maxSize.toInt))
       } else {
         errorInfo
       }
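The stack-trace truncation above leans on Apache Commons Lang's `StringUtils.abbreviate`, which caps a string at a maximum width and marks the cut with "...". A tiny standalone sketch (the input string is invented):

    import org.apache.commons.lang3.StringUtils

    // Keeps at most 16 characters, the last three being the "..." marker.
    StringUtils.abbreviate("org.apache.spark.SparkException: boom", 16)
    // => "org.apache.sp..."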
2 changes: 1 addition & 1 deletion connector/docker-integration-tests/pom.xml
@@ -55,7 +55,7 @@
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
-      <version>18.0</version>
+      <version>19.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -1789,7 +1789,7 @@ class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase {
       CheckAnswer(data: _*),
       Execute { query =>
         // The rate limit is 1, so there must be some delay in offsets per partition.
-        val progressWithDelay = query.recentProgress.map(_.sources.head).reverse.find { progress =>
+        val progressWithDelay = query.recentProgress.map(_.sources.head).findLast { progress =>
          // find the metrics that has non-zero average offsetsBehindLatest greater than 0.
          !progress.metrics.isEmpty && progress.metrics.get("avgOffsetsBehindLatest").toDouble > 0
        }
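This is SPARK-46509 applied to the Kafka suite: `.reverse.find` first materializes a reversed copy of the sequence, while `findLast` states the intent directly and, for indexed sequences, scans from the end without copying. A small equivalence sketch (standalone, values invented):

    val offsets = Seq(0.0, 2.5, 0.0, 3.1, 0.0)
    // Both return the last element matching the predicate.
    assert(offsets.reverse.find(_ > 0) == offsets.findLast(_ > 0))  // Some(3.1)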
56 changes: 28 additions & 28 deletions core/benchmarks/ZStandardBenchmark-jdk21-results.txt
@@ -2,48 +2,48 @@
 Benchmark ZStandardCompressionCodec
 ================================================================================================

-OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1051-azure
+OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1053-azure
 AMD EPYC 7763 64-Core Processor
 Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 --------------------------------------------------------------------------------------------------------------------------------------
-Compression 10000 times at level 1 without buffer pool 672 681 10 0.0 67171.0 1.0X
-Compression 10000 times at level 2 without buffer pool 715 718 4 0.0 71458.8 0.9X
-Compression 10000 times at level 3 without buffer pool 831 835 4 0.0 83139.1 0.8X
-Compression 10000 times at level 1 with buffer pool 609 611 2 0.0 60881.5 1.1X
-Compression 10000 times at level 2 with buffer pool 648 649 1 0.0 64791.0 1.0X
-Compression 10000 times at level 3 with buffer pool 744 751 6 0.0 74392.4 0.9X
+Compression 10000 times at level 1 without buffer pool 674 920 293 0.0 67406.4 1.0X
+Compression 10000 times at level 2 without buffer pool 882 884 3 0.0 88195.1 0.8X
+Compression 10000 times at level 3 without buffer pool 973 978 4 0.0 97301.3 0.7X
+Compression 10000 times at level 1 with buffer pool 955 955 1 0.0 95452.0 0.7X
+Compression 10000 times at level 2 with buffer pool 994 996 2 0.0 99432.1 0.7X
+Compression 10000 times at level 3 with buffer pool 1093 1101 12 0.0 109300.9 0.6X

-OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1051-azure
+OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1053-azure
 AMD EPYC 7763 64-Core Processor
 Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------------------------
-Decompression 10000 times from level 1 without buffer pool 842 849 12 0.0 84240.0 1.0X
-Decompression 10000 times from level 2 without buffer pool 842 846 6 0.0 84185.2 1.0X
-Decompression 10000 times from level 3 without buffer pool 843 844 1 0.0 84285.4 1.0X
-Decompression 10000 times from level 1 with buffer pool 770 771 1 0.0 77024.9 1.1X
-Decompression 10000 times from level 2 with buffer pool 771 771 0 0.0 77120.4 1.1X
-Decompression 10000 times from level 3 with buffer pool 770 771 0 0.0 77031.9 1.1X
+Decompression 10000 times from level 1 without buffer pool 826 829 3 0.0 82591.4 1.0X
+Decompression 10000 times from level 2 without buffer pool 825 826 1 0.0 82533.4 1.0X
+Decompression 10000 times from level 3 without buffer pool 827 830 5 0.0 82715.3 1.0X
+Decompression 10000 times from level 1 with buffer pool 763 764 1 0.0 76271.6 1.1X
+Decompression 10000 times from level 2 with buffer pool 763 777 23 0.0 76321.2 1.1X
+Decompression 10000 times from level 3 with buffer pool 763 765 2 0.0 76286.1 1.1X

-OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1051-azure
+OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1053-azure
 AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 48 50 3 0.0 376597.0 1.0X
-Parallel Compression with 1 workers 41 42 3 0.0 318927.3 1.2X
-Parallel Compression with 2 workers 38 40 2 0.0 297410.2 1.3X
-Parallel Compression with 4 workers 37 39 1 0.0 287605.8 1.3X
-Parallel Compression with 8 workers 39 40 1 0.0 301948.1 1.2X
-Parallel Compression with 16 workers 41 43 1 0.0 317095.6 1.2X
+Parallel Compression with 0 workers 49 50 1 0.0 384188.1 1.0X
+Parallel Compression with 1 workers 42 44 4 0.0 328139.4 1.2X
+Parallel Compression with 2 workers 40 42 1 0.0 309013.2 1.2X
+Parallel Compression with 4 workers 40 41 1 0.0 309732.2 1.2X
+Parallel Compression with 8 workers 41 43 2 0.0 319730.2 1.2X
+Parallel Compression with 16 workers 43 45 1 0.0 337944.2 1.1X

-OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1051-azure
+OpenJDK 64-Bit Server VM 21.0.1+12-LTS on Linux 5.15.0-1053-azure
 AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 9: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 174 175 1 0.0 1360596.3 1.0X
-Parallel Compression with 1 workers 189 228 24 0.0 1477060.7 0.9X
-Parallel Compression with 2 workers 109 118 15 0.0 851455.9 1.6X
-Parallel Compression with 4 workers 114 118 3 0.0 891964.9 1.5X
-Parallel Compression with 8 workers 115 122 4 0.0 899748.7 1.5X
-Parallel Compression with 16 workers 119 123 2 0.0 931210.7 1.5X
+Parallel Compression with 0 workers 160 161 1 0.0 1250203.7 1.0X
+Parallel Compression with 1 workers 196 197 2 0.0 1529028.2 0.8X
+Parallel Compression with 2 workers 114 121 10 0.0 892592.4 1.4X
+Parallel Compression with 4 workers 111 113 1 0.0 865617.7 1.4X
+Parallel Compression with 8 workers 112 117 2 0.0 878723.8 1.4X
+Parallel Compression with 16 workers 114 117 2 0.0 889199.7 1.4X