diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
index 521b87e31e80..f49697522dca 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
@@ -37,10 +37,9 @@
 import java.security.NoSuchAlgorithmException;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.stream.Stream;
-import org.apache.directory.api.util.Hex;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
@@ -87,7 +86,7 @@ static String md5sum(InputStream is) throws IOException {
     while ((numBytes = is.read(bytes)) != -1) {
       md.update(bytes, 0, numBytes);
     }
-    return new String(Hex.encodeHex(md.digest())).toUpperCase(Locale.ROOT);
+    return Hex.encodeHexString(md.digest(), false);
   }
 
   private static void inputStreamToFile(InputStream inputStream, File targetFile)
diff --git a/build.gradle b/build.gradle
index 099192abaf84..7fd4c5106568 100644
--- a/build.gradle
+++ b/build.gradle
@@ -348,7 +348,7 @@ project(':iceberg-core') {
     implementation libs.jackson.databind
     implementation libs.caffeine
     implementation libs.roaringbitmap
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -373,7 +373,7 @@ project(':iceberg-data') {
     implementation project(':iceberg-core')
     compileOnly project(':iceberg-parquet')
     compileOnly project(':iceberg-orc')
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'commons-beanutils'
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
@@ -396,7 +396,7 @@ project(':iceberg-data') {
 
     compileOnly libs.avro.avro
 
-    testImplementation(libs.hadoop2.client) {
+    testImplementation(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -427,7 +427,7 @@ project(':iceberg-aliyun') {
     compileOnly libs.jaxb.api
     compileOnly libs.activation
     compileOnly libs.jaxb.runtime
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -470,7 +470,7 @@ project(':iceberg-aws') {
     compileOnly("software.amazon.awssdk:dynamodb")
     compileOnly("software.amazon.awssdk:lakeformation")
 
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -572,7 +572,7 @@ project(':iceberg-delta-lake') {
 
     compileOnly "io.delta:delta-standalone_${scalaVersion}:${libs.versions.delta.standalone.get()}"
 
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -584,7 +584,7 @@ project(':iceberg-delta-lake') {
     if (sparkVersions.contains("3.5")) {
integrationImplementation "io.delta:delta-spark_${scalaVersion}:${libs.versions.delta.spark.get()}" integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.5_${scalaVersion}") - integrationImplementation(libs.hadoop2.minicluster) { + integrationImplementation(libs.hadoop3.minicluster) { exclude group: 'org.apache.avro', module: 'avro' // to make sure netty libs only come from project(':iceberg-arrow') exclude group: 'io.netty', module: 'netty-buffer' @@ -645,7 +645,7 @@ project(':iceberg-gcp') { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - testImplementation(libs.hadoop2.common) { + testImplementation(libs.hadoop3.common) { exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-log4j12' exclude group: 'javax.servlet', module: 'servlet-api' @@ -722,7 +722,7 @@ project(':iceberg-hive-metastore') { exclude group: 'com.zaxxer', module: 'HikariCP' } - compileOnly(libs.hadoop2.client) { + compileOnly(libs.hadoop3.client) { exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-log4j12' } @@ -754,12 +754,12 @@ project(':iceberg-orc') { exclude group: 'org.apache.hive', module: 'hive-storage-api' } - compileOnly(libs.hadoop2.common) { + compileOnly(libs.hadoop3.common) { exclude group: 'commons-beanutils' exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-log4j12' } - compileOnly(libs.hadoop2.client) { + compileOnly(libs.hadoop3.client) { exclude group: 'org.apache.avro', module: 'avro' } @@ -788,7 +788,7 @@ project(':iceberg-parquet') { } compileOnly libs.avro.avro - compileOnly(libs.hadoop2.client) { + compileOnly(libs.hadoop3.client) { exclude group: 'org.apache.avro', module: 'avro' } @@ -832,8 +832,8 @@ project(':iceberg-arrow') { // We import :netty-common through :arrow-memory-netty // so that the same version as used by the :arrow-memory-netty module is picked. 
     testImplementation libs.arrow.memory.netty
-    testImplementation libs.hadoop2.common
-    testImplementation libs.hadoop2.mapreduce.client.core
+    testImplementation libs.hadoop3.common
+    testImplementation libs.hadoop3.mapreduce.client.core
   }
 }
 
@@ -854,7 +854,7 @@ project(':iceberg-nessie') {
     implementation libs.jackson.core
     implementation libs.jackson.databind
 
-    compileOnly libs.hadoop2.common
+    compileOnly libs.hadoop3.common
     // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages
     compileOnly libs.microprofile.openapi.api
 
diff --git a/flink/v1.18/build.gradle b/flink/v1.18/build.gradle
index 83dc07523a3c..d6e27514cb4a 100644
--- a/flink/v1.18/build.gradle
+++ b/flink/v1.18/build.gradle
@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink118.connector.base
     compileOnly libs.flink118.connector.files
 
-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
@@ -186,9 +186,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink118.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink118.get()}"
 
-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
diff --git a/flink/v1.19/build.gradle b/flink/v1.19/build.gradle
index 50bcadb618e4..599ba085e4c4 100644
--- a/flink/v1.19/build.gradle
+++ b/flink/v1.19/build.gradle
@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink119.connector.base
     compileOnly libs.flink119.connector.files
 
-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink119.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}"
 
-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle
index 4a1bae660bdb..3e308d22b021 100644
--- a/flink/v1.20/build.gradle
+++ b/flink/v1.20/build.gradle
@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink120.connector.base
     compileOnly libs.flink120.connector.files
 
-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink120.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink120.get()}"
 
-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 5cb71c6cfba3..04c7bf649297 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -46,7 +46,6 @@ flink119 = { strictly = "1.19.1"}
 flink120 = { strictly = "1.20.0"}
 google-libraries-bom = "26.54.0"
 guava = "33.4.0-jre"
-hadoop2 = "2.7.3"
 hadoop3 = "3.4.1"
 httpcomponents-httpclient5 = "5.4.2"
 hive2 = { strictly = "2.3.10"} # see rich version usage explanation above
@@ -124,13 +123,11 @@ flink120-streaming-java = { module = "org.apache.flink:flink-streaming-java", ve
 flink120-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink120" }
 google-libraries-bom = { module = "com.google.cloud:libraries-bom", version.ref = "google-libraries-bom" }
 guava-guava = { module = "com.google.guava:guava", version.ref = "guava" }
-hadoop2-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop2" }
-hadoop2-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop2" }
-hadoop2-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop2" }
-hadoop2-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop2" }
-hadoop2-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop2" }
 hadoop3-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop3" }
 hadoop3-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop3" }
+hadoop3-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop3" }
+hadoop3-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop3" }
+hadoop3-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop3" }
 hive2-exec = { module = "org.apache.hive:hive-exec", version.ref = "hive2" }
 hive2-metastore = { module = "org.apache.hive:hive-metastore", version.ref = "hive2" }
 hive2-service = { module = "org.apache.hive:hive-service", version.ref = "hive2" }
diff --git a/mr/build.gradle b/mr/build.gradle
index f41c5410a93f..dac7e3d4542b 100644
--- a/mr/build.gradle
+++ b/mr/build.gradle
@@ -37,7 +37,7 @@ project(':iceberg-mr') {
     implementation project(':iceberg-orc')
     implementation project(':iceberg-parquet')
 
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
 
diff --git a/spark/v3.4/build.gradle b/spark/v3.4/build.gradle
index 6eb26e8b73c1..e9f310ef7b36 100644
--- a/spark/v3.4/build.gradle
+++ b/spark/v3.4/build.gradle
@@ -96,7 +96,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {
 
     implementation libs.caffeine
 
-    testImplementation(libs.hadoop2.minicluster) {
+    testImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
       // to make sure netty libs only come from project(':iceberg-arrow')
       exclude group: 'io.netty', module: 'netty-buffer'
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java
index 724c6edde26a..26cb4dcc9508 100644
--- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java
+++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java
@@ -70,6 +70,7 @@
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.junit.AfterClass;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -106,6 +107,13 @@ public static void startSpark() {
     TestCompressionSettings.spark = SparkSession.builder().master("local[2]").getOrCreate();
   }
 
+  @Before
+  public void resetSpecificConfigurations() {
+    spark.conf().unset(COMPRESSION_CODEC);
+    spark.conf().unset(COMPRESSION_LEVEL);
+    spark.conf().unset(COMPRESSION_STRATEGY);
+  }
+
   @Parameterized.AfterParam
   public static void clearSourceCache() {
     spark.sql(String.format("DROP TABLE IF EXISTS %s", TABLE_NAME));
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
index f420f1b955c0..961d69b72127 100644
--- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
+++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java
@@ -22,6 +22,8 @@
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
 
 import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.PartitionSpec;
@@ -118,6 +120,7 @@ public void testStreamingWriteAppendMode() throws Exception {
       // remove the last commit to force Spark to reprocess batch #1
       File lastCommitFile = new File(checkpoint + "/commits/1");
       Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+      Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));
 
       // restart the query from the checkpoint
       StreamingQuery restartedQuery = streamWriter.start();
@@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception {
       // remove the last commit to force Spark to reprocess batch #1
       File lastCommitFile = new File(checkpoint + "/commits/1");
       Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+      Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));
 
       // restart the query from the checkpoint
       StreamingQuery restartedQuery = streamWriter.start();
@@ -238,6 +242,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception {
       // remove the last commit to force Spark to reprocess batch #1
       File lastCommitFile = new File(checkpoint + "/commits/1");
       Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+      Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));
 
       // restart the query from the checkpoint
       StreamingQuery restartedQuery = streamWriter.start();
diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle
index e2d2c7a7ac07..52d9ce0348f6 100644
--- a/spark/v3.5/build.gradle
+++ b/spark/v3.5/build.gradle
project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { implementation libs.caffeine - testImplementation(libs.hadoop2.minicluster) { + testImplementation(libs.hadoop3.minicluster) { exclude group: 'org.apache.avro', module: 'avro' // to make sure netty libs only come from project(':iceberg-arrow') exclude group: 'io.netty', module: 'netty-buffer' diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 17db46b85c35..c84a65cbe951 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -23,7 +23,9 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -117,6 +119,7 @@ public void testStreamingWriteAppendMode() throws Exception { // remove the last commit to force Spark to reprocess batch #1 File lastCommitFile = new File(checkpoint + "/commits/1"); assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue(); + Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc")); // restart the query from the checkpoint StreamingQuery restartedQuery = streamWriter.start(); @@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception { // remove the last commit to force Spark to reprocess batch #1 File lastCommitFile = new File(checkpoint + "/commits/1"); assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue(); + Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc")); // restart the query from the checkpoint StreamingQuery restartedQuery = streamWriter.start(); @@ -239,6 +243,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { // remove the last commit to force Spark to reprocess batch #1 File lastCommitFile = new File(checkpoint + "/commits/1"); assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue(); + Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc")); // restart the query from the checkpoint StreamingQuery restartedQuery = streamWriter.start();