Merged
26 commits
5ef9af3
Move 3.5 as 4.0
huaxingao Mar 7, 2025
c739558
Copy back 4.0 as 3.5
huaxingao Mar 7, 2025
7b5edd7
initial support for Spark 4.0
huaxingao Mar 10, 2025
c186dde
spark.hive35 -> spark35
huaxingao Mar 10, 2025
474eff4
change SPEC_ID to optional
huaxingao Apr 2, 2025
913d74b
more changes
huaxingao Apr 11, 2025
0cfbd8a
spotless
huaxingao Apr 11, 2025
f8661ce
copy over the latest 3.5
huaxingao Apr 11, 2025
87c2625
fix test failures
huaxingao Apr 12, 2025
a6fc6b5
Use DBCP to be consistent with Spark
huaxingao Apr 16, 2025
2d8e109
switch to Spark 4.0 RC4
huaxingao Apr 17, 2025
5775521
switch back to HikariCP
huaxingao Apr 17, 2025
6cb26e3
retry snapshot
huaxingao Apr 17, 2025
aaba16e
change spark-ci.yml to upload the reports for failed test
huaxingao Apr 17, 2025
1520135
spotlessApply
huaxingao Apr 17, 2025
66ae025
copy the latest v3.5
huaxingao Apr 18, 2025
f167416
address comments
huaxingao Apr 18, 2025
80748fb
spotlessApply
huaxingao Apr 18, 2025
3832ce9
rebase
huaxingao May 7, 2025
476b079
fix wrong number of args
huaxingao May 7, 2025
80044a5
change to Preconditions.checkArgument
huaxingao May 7, 2025
5170f8c
Revert "Spark 3.5: Update MERGE and UPDATE for row lineage (#12736)"
huaxingao May 7, 2025
75d2c14
Revert "Revert "Spark 3.5: Update MERGE and UPDATE for row lineage (#…
huaxingao May 7, 2025
fb7fb9c
manually revert Update MERGE and UPDATE for row lineage (#12736) from…
huaxingao May 7, 2025
9c63a4c
fix test failure
huaxingao May 8, 2025
60568f1
bump Comet version to 0.8.1
huaxingao May 8, 2025
4 changes: 2 additions & 2 deletions .github/workflows/java-ci.yml
@@ -95,7 +95,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
jvm: [11, 17, 21]
jvm: [17, 21]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
@@ -108,7 +108,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
jvm: [11, 17, 21]
jvm: [17, 21]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
4 changes: 2 additions & 2 deletions .github/workflows/jmh-benchmarks.yml
@@ -28,8 +28,8 @@ on:
description: 'The branch name'
required: true
spark_version:
description: 'The spark project version to use, such as iceberg-spark-3.5'
default: 'iceberg-spark-3.5'
description: 'The spark project version to use, such as iceberg-spark-4.0'
default: 'iceberg-spark-4.0'
required: true
benchmarks:
description: 'A list of comma-separated double-quoted Benchmark names, such as "IcebergSourceFlatParquetDataReadBenchmark", "IcebergSourceFlatParquetDataFilterBenchmark"'
2 changes: 1 addition & 1 deletion .github/workflows/publish-snapshot.yml
@@ -41,4 +41,4 @@ jobs:
- run: |
./gradlew printVersion
./gradlew -DallModules publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
./gradlew -DflinkVersions= -DsparkVersions=3.4,3.5 -DscalaVersion=2.13 -DkafkaVersions=3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
./gradlew -DflinkVersions= -DsparkVersions=3.4,3.5,4.0 -DscalaVersion=2.13 -DkafkaVersions=3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
2 changes: 1 addition & 1 deletion .github/workflows/recurring-jmh-benchmarks.yml
@@ -41,7 +41,7 @@ jobs:
"IcebergSourceNestedParquetDataReadBenchmark", "IcebergSourceNestedParquetDataWriteBenchmark",
"IcebergSourceParquetEqDeleteBenchmark", "IcebergSourceParquetMultiDeleteFileBenchmark",
"IcebergSourceParquetPosDeleteBenchmark", "IcebergSourceParquetWithUnrelatedDeleteBenchmark"]
spark_version: ['iceberg-spark-3.5']
spark_version: ['iceberg-spark-4.0']
env:
SPARK_LOCAL_IP: localhost
steps:
6 changes: 5 additions & 1 deletion .github/workflows/spark-ci.yml
@@ -71,13 +71,17 @@ jobs:
strategy:
matrix:
jvm: [11, 17, 21]
spark: ['3.4', '3.5']
spark: ['3.4', '3.5', '4.0']
scala: ['2.12', '2.13']
exclude:
# Spark 3.5 is the first version not failing on Java 21 (https://issues.apache.org/jira/browse/SPARK-42369)
# Full Java 21 support is coming in Spark 4 (https://issues.apache.org/jira/browse/SPARK-43831)
- jvm: 11
spark: '4.0'
- jvm: 21
spark: '3.4'
- spark: '4.0'
scala: '2.12'
env:
SPARK_LOCAL_IP: localhost
steps:
2 changes: 2 additions & 0 deletions .gitignore
@@ -35,6 +35,8 @@ spark/v3.4/spark/benchmark/*
spark/v3.4/spark-extensions/benchmark/*
spark/v3.5/spark/benchmark/*
spark/v3.5/spark-extensions/benchmark/*
spark/v4.0/spark/benchmark/*
spark/v4.0/spark-extensions/benchmark/*
*/benchmark/*

__pycache__/
3 changes: 3 additions & 0 deletions build.gradle
@@ -120,6 +120,9 @@ allprojects {
repositories {
mavenCentral()
mavenLocal()
maven {
url "https://repository.apache.org/content/repositories/orgapachespark-1480/"
}
}
}

2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/MetadataColumns.java
@@ -51,7 +51,7 @@ private MetadataColumns() {}
public static final int SPEC_ID_COLUMN_ID = Integer.MAX_VALUE - 4;
public static final String SPEC_ID_COLUMN_DOC = "Spec ID used to track the file containing a row";
public static final NestedField SPEC_ID =
NestedField.required(
NestedField.optional(
huaxingao (Contributor, Author):

In Spark 4.0, the metadata columns are nullable, so we need to change this field to optional.
apache/spark#50246 (comment)

SPEC_ID_COLUMN_ID, "_spec_id", Types.IntegerType.get(), SPEC_ID_COLUMN_DOC);
// the partition column type is not static and depends on all specs in the table
public static final int PARTITION_COLUMN_ID = Integer.MAX_VALUE - 5;
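For readability, here is the SPEC_ID constant as it reads after this change, reassembled from the hunk above:

// _spec_id is now optional because Spark 4.0 treats metadata columns as nullable
// (see apache/spark#50246).
public static final NestedField SPEC_ID =
    NestedField.optional(
        SPEC_ID_COLUMN_ID, "_spec_id", Types.IntegerType.get(), SPEC_ID_COLUMN_DOC);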
4 changes: 2 additions & 2 deletions gradle.properties
@@ -18,8 +18,8 @@ jmhJsonOutputPath=build/reports/jmh/results.json
jmhIncludeRegex=.*
systemProp.defaultFlinkVersions=2.0
systemProp.knownFlinkVersions=1.19,1.20,2.0
systemProp.defaultSparkVersions=3.5
systemProp.knownSparkVersions=3.4,3.5
systemProp.defaultSparkVersions=4.0
systemProp.knownSparkVersions=3.4,3.5,4.0
systemProp.defaultKafkaVersions=3
systemProp.knownKafkaVersions=3
systemProp.defaultScalaVersion=2.12
5 changes: 5 additions & 0 deletions gradle/libs.versions.toml
@@ -24,6 +24,7 @@ activation = "1.1.1"
aliyun-sdk-oss = "3.10.2"
analyticsaccelerator = "1.0.0"
antlr = "4.9.3"
antlr413 = "4.13.1" # For Spark 4.0 support
huaxingao (Contributor, Author):

To keep the ANTLR version consistent with the one used in Spark 4.0.
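A minimal sketch of how the Spark 4.0 modules might consume these new catalog entries (the v4.0 build script itself is not shown in this diff, so the exact wiring here is an assumption):

// Hypothetical dependency wiring in spark/v4.0/build.gradle (not part of this diff):
// keep the generated parser and its runtime on the same ANTLR 4.13 line that Spark 4.0 uses.
dependencies {
  antlr libs.antlr.antlr413          // parser generation only
  runtimeOnly libs.antlr.runtime413  // runtime matching Spark 4.0's ANTLR
}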

aircompressor = "0.27"
apiguardian = "1.1.2"
arrow = "15.0.2"
@@ -36,6 +37,7 @@ awssdk-s3accessgrants = "2.3.0"
bson-ver = "4.11.5"
caffeine = "2.9.3"
calcite = "1.39.0"
comet = "0.8.1"
datasketches = "6.2.0"
delta-standalone = "3.3.1"
delta-spark = "3.3.1"
@@ -81,6 +83,7 @@ slf4j = "2.0.17"
snowflake-jdbc = "3.24.0"
spark34 = "3.4.4"
spark35 = "3.5.5"
spark40 = "4.0.0"
sqlite-jdbc = "3.49.1.0"
testcontainers = "1.21.0"
tez08 = { strictly = "0.8.4"} # see rich version usage explanation above
@@ -92,6 +95,8 @@ aliyun-sdk-oss = { module = "com.aliyun.oss:aliyun-sdk-oss", version.ref = "aliy
analyticsaccelerator-s3 = { module = "software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3", version.ref = "analyticsaccelerator" }
antlr-antlr4 = { module = "org.antlr:antlr4", version.ref = "antlr" }
antlr-runtime = { module = "org.antlr:antlr4-runtime", version.ref = "antlr" }
antlr-antlr413 = { module = "org.antlr:antlr4", version.ref = "antlr413" }
antlr-runtime413 = { module = "org.antlr:antlr4-runtime", version.ref = "antlr413" }
arrow-memory-netty = { module = "org.apache.arrow:arrow-memory-netty", version.ref = "arrow" }
arrow-vector = { module = "org.apache.arrow:arrow-vector", version.ref = "arrow" }
avro-avro = { module = "org.apache.avro:avro", version.ref = "avro" }
@@ -282,6 +282,7 @@ private void initConf(HiveConf conf, int port, boolean directSql) {
// Setting this to avoid thrift exception during running Iceberg tests outside Iceberg.
conf.set(
HiveConf.ConfVars.HIVE_IN_TEST.varname, HiveConf.ConfVars.HIVE_IN_TEST.getDefaultValue());
conf.set("datanucleus.connectionPoolingType", "DBCP");
}

private static void setupMetastoreDB(String dbURL) throws SQLException, IOException {
5 changes: 5 additions & 0 deletions jmh.gradle
@@ -48,6 +48,11 @@ if (sparkVersions.contains("3.5")) {
jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-3.5_${scalaVersion}"))
}

if (sparkVersions.contains("4.0")) {
jmhProjects.add(project(":iceberg-spark:iceberg-spark-4.0_2.13"))
jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-4.0_2.13"))
}

configure(jmhProjects) {
apply plugin: 'me.champeau.jmh'
apply plugin: 'io.morethan.jmhreport'
12 changes: 12 additions & 0 deletions settings.gradle
@@ -161,6 +161,18 @@ if (sparkVersions.contains("3.5")) {
project(":iceberg-spark:spark-runtime-3.5_${scalaVersion}").name = "iceberg-spark-runtime-3.5_${scalaVersion}"
}

if (sparkVersions.contains("4.0")) {
include ":iceberg-spark:spark-4.0_2.13"
Reviewer (Member):

Does this theoretically create Spark 4.0 without error if Scala 2.12 is set? Feels like we should still use scalaVersion here but have an assert > 2.12 or something.

huaxingao (Contributor, Author):

The problem is that Flink doesn't work with Scala 2.13. I use 2.13 to build the Spark module only. I think we can't assert > 2.12 because we still need 2.12 to build Flink.

RussellSpitzer (Member), May 6, 2025:

Doesn't that mean that when Scala is set to 2.12 we just should not allow building Spark 4.0?

include ":iceberg-spark:spark-extensions-4.0_2.13"
include ":iceberg-spark:spark-runtime-4.0_2.13"
project(":iceberg-spark:spark-4.0_2.13").projectDir = file('spark/v4.0/spark')
project(":iceberg-spark:spark-4.0_2.13").name = "iceberg-spark-4.0_2.13"
project(":iceberg-spark:spark-extensions-4.0_2.13").projectDir = file('spark/v4.0/spark-extensions')
project(":iceberg-spark:spark-extensions-4.0_2.13").name = "iceberg-spark-extensions-4.0_2.13"
project(":iceberg-spark:spark-runtime-4.0_2.13").projectDir = file('spark/v4.0/spark-runtime')
project(":iceberg-spark:spark-runtime-4.0_2.13").name = "iceberg-spark-runtime-4.0_2.13"
}

if (kafkaVersions.contains("3")) {
include 'kafka-connect'
project(':kafka-connect').name = 'iceberg-kafka-connect'
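Following up on the review thread above: a minimal sketch of the guard RussellSpitzer suggests (not part of this PR; it assumes the sparkVersions and scalaVersion values already resolved earlier in settings.gradle):

// Hypothetical guard, not included in this PR: the Spark 4.0 modules are hard-wired to
// Scala 2.13, so fail fast instead of silently ignoring -DscalaVersion=2.12.
if (sparkVersions.contains("4.0") && scalaVersion == "2.12") {
  throw new GradleException(
      "Spark 4.0 requires Scala 2.13; rerun with -DscalaVersion=2.13 or drop 4.0 from -DsparkVersions")
}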
4 changes: 4 additions & 0 deletions spark/build.gradle
@@ -27,3 +27,7 @@ if (sparkVersions.contains("3.4")) {
if (sparkVersions.contains("3.5")) {
apply from: file("$projectDir/v3.5/build.gradle")
}

if (sparkVersions.contains("4.0")) {
apply from: file("$projectDir/v4.0/build.gradle")
}
4 changes: 2 additions & 2 deletions spark/v3.4/build.gradle
@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {
exclude group: 'org.roaringbitmap'
}

compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0"
compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}"

implementation libs.parquet.column
implementation libs.parquet.hadoop
@@ -186,7 +186,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer
testImplementation libs.parquet.hadoop
testImplementation libs.awaitility
testImplementation libs.junit.vintage.engine
testImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0"
testImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}"

// Required because we remove antlr plugin dependencies from the compile configuration, see note above
runtimeOnly libs.antlr.runtime
@@ -19,11 +19,11 @@
package org.apache.iceberg.spark.data.vectorized;

import java.io.IOException;
import org.apache.comet.CometSchemaImporter;
import org.apache.comet.parquet.AbstractColumnReader;
import org.apache.comet.parquet.ColumnReader;
import org.apache.comet.parquet.TypeUtil;
import org.apache.comet.parquet.Utils;
import org.apache.comet.shaded.arrow.c.CometSchemaImporter;
import org.apache.comet.shaded.arrow.memory.RootAllocator;
import org.apache.iceberg.parquet.VectorizedReader;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
4 changes: 2 additions & 2 deletions spark/v3.5/build.gradle
@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {
exclude group: 'org.roaringbitmap'
}

compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0"
compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}"

implementation libs.parquet.column
implementation libs.parquet.hadoop
@@ -184,7 +184,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer
testImplementation libs.avro.avro
testImplementation libs.parquet.hadoop
testImplementation libs.awaitility
testImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0"
testImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}"

// Required because we remove antlr plugin dependencies from the compile configuration, see note above
runtimeOnly libs.antlr.runtime
@@ -19,11 +19,11 @@
package org.apache.iceberg.spark.data.vectorized;

import java.io.IOException;
import org.apache.comet.CometSchemaImporter;
import org.apache.comet.parquet.AbstractColumnReader;
import org.apache.comet.parquet.ColumnReader;
import org.apache.comet.parquet.TypeUtil;
import org.apache.comet.parquet.Utils;
import org.apache.comet.shaded.arrow.c.CometSchemaImporter;
import org.apache.comet.shaded.arrow.memory.RootAllocator;
import org.apache.iceberg.parquet.VectorizedReader;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;