1 change: 1 addition & 0 deletions build.gradle
@@ -845,6 +845,7 @@ project(':iceberg-orc') {
}

project(':iceberg-parquet') {

test {
useJUnitPlatform()
}
2 changes: 1 addition & 1 deletion gradle/libs.versions.toml
@@ -37,7 +37,7 @@ awssdk-s3accessgrants = "2.3.0"
bson-ver = "4.11.5"
caffeine = "2.9.3"
calcite = "1.40.0"
comet = "0.8.1"
comet = "0.10.1"
datasketches = "6.2.0"
delta-standalone = "3.3.2"
delta-spark = "3.3.2"
57 changes: 56 additions & 1 deletion parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -56,6 +56,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.ServiceLoader;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.Function;
@@ -1289,6 +1290,20 @@ public ReadBuilder setCustomType(int fieldId, Class<? extends StructLike> struct
throw new UnsupportedOperationException("Custom types are not yet supported");
}

/**
* @deprecated Use {@link #set(String, String)} with "read.parquet.vectorized-reader.factory" =
* "comet" instead
*/
@Deprecated
public ReadBuilder enableComet(boolean enableComet) {
if (enableComet) {
this.properties.put("read.parquet.vectorized-reader.factory", "comet");
} else {
this.properties.remove("read.parquet.vectorized-reader.factory");
}
return this;
}
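For context, a minimal sketch of the property-based configuration that the deprecation note points to. The builder methods shown (`read`, `project`, `createBatchedReaderFunc`, `set`, `build`) already exist on `Parquet.ReadBuilder`; the variables `inputFile`, `projectedSchema`, and `batchReaderFunc`, and the use of Spark's `ColumnarBatch` as the record type, are assumptions for illustration only.

```java
// Sketch only: selects the Comet vectorized reader via the new property
// instead of the deprecated enableComet(true).
CloseableIterable<ColumnarBatch> batches =
    Parquet.read(inputFile)                              // InputFile to read
        .project(projectedSchema)                        // expected Iceberg schema
        .createBatchedReaderFunc(batchReaderFunc)        // MessageType -> VectorizedReader<?>
        .set("read.parquet.vectorized-reader.factory", "comet")
        .build();
```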

public ReadBuilder withFileEncryptionKey(ByteBuffer encryptionKey) {
this.fileEncryptionKey = encryptionKey;
return this;
@@ -1300,7 +1315,7 @@ public ReadBuilder withAADPrefix(ByteBuffer aadPrefix) {
}

@Override
@SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"})
@SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity", "MethodLength"})
public <D> CloseableIterable<D> build() {
FileDecryptionProperties fileDecryptionProperties = null;
if (fileEncryptionKey != null) {
@@ -1352,6 +1367,32 @@ public <D> CloseableIterable<D> build() {
}

if (batchedReaderFunc != null) {
// Try to load custom vectorized reader factory from properties
String readerName = properties.get("read.parquet.vectorized-reader.factory");

if (readerName != null) {
LOG.info("Loading custom vectorized reader factory: {}", readerName);
VectorizedParquetReaderFactory factory = loadReaderFactory(readerName);
if (factory != null) {
return factory.createReader(
file,
schema,
options,
batchedReaderFunc,
mapping,
filter,
reuseContainers,
caseSensitive,
maxRecordsPerBatch,
properties,
start,
length,
fileEncryptionKey,
fileAADPrefix);
}
}

// Fall back to default VectorizedParquetReader
return new VectorizedParquetReader<>(
file,
schema,
@@ -1444,6 +1485,20 @@ public <D> CloseableIterable<D> build() {
}
}

private static VectorizedParquetReaderFactory loadReaderFactory(String name) {
ServiceLoader<VectorizedParquetReaderFactory> loader =
ServiceLoader.load(VectorizedParquetReaderFactory.class);

for (VectorizedParquetReaderFactory factory : loader) {
if (factory.name().equalsIgnoreCase(name)) {
return factory;
}
}

LOG.warn("Could not find vectorized reader factory: {}", name);
return null;
}

private static class ParquetReadBuilder<T> extends ParquetReader.Builder<T> {
private Schema schema = null;
private ReadSupport<T> readSupport = null;
30 changes: 19 additions & 11 deletions parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
@@ -43,7 +43,7 @@
*
* @param <T> type of value to read
*/
class ReadConf<T> {
public class ReadConf<T> {
private final ParquetFileReader reader;
private final InputFile file;
private final ParquetReadOptions options;
@@ -60,7 +60,7 @@ class ReadConf<T> {
private final List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetaDataForRowGroups;

@SuppressWarnings("unchecked")
ReadConf(
public ReadConf(
Review thread on this line:

Contributor: do we need this public?

Author: Yes we do. The Comet reader is now in a different module and needs to access the read conf for read information. I could make it protected and then derive a class explicitly meant for custom readers, but that seems overkill.

Member: If it's used outside parquet module, it would be better renamed ParquetReadConf.

Author: I would rather not rename an existing class. I feel it will reduce confusion on the scope of the changes.

InputFile file,
ParquetReadOptions options,
Schema expectedSchema,
@@ -146,7 +146,7 @@ private ReadConf(ReadConf<T> toCopy) {
this.columnChunkMetaDataForRowGroups = toCopy.columnChunkMetaDataForRowGroups;
}

ParquetFileReader reader() {
public ParquetFileReader reader() {
if (reader != null) {
reader.setRequestedSchema(projection);
return reader;
@@ -157,35 +157,43 @@ ParquetFileReader reader() {
return newReader;
}

ParquetValueReader<T> model() {
public InputFile file() {
return file;
}

public MessageType projection() {
return projection;
}

public ParquetValueReader<T> model() {
return model;
}

VectorizedReader<T> vectorizedModel() {
public VectorizedReader<T> vectorizedModel() {
return vectorizedModel;
}

boolean[] shouldSkip() {
public boolean[] shouldSkip() {
return shouldSkip;
}

long totalValues() {
public long totalValues() {
return totalValues;
}

boolean reuseContainers() {
public boolean reuseContainers() {
return reuseContainers;
}

Integer batchSize() {
public Integer batchSize() {
return batchSize;
}

List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetadataForRowGroups() {
public List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetadataForRowGroups() {
return columnChunkMetaDataForRowGroups;
}

ReadConf<T> copy() {
public ReadConf<T> copy() {
return new ReadConf<>(this);
}

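Since the review thread above asks why `ReadConf` needs to be public, here is a minimal sketch of how a reader factory in another module might consume the now-public accessors. The `ReadConfInspector` class and `logReadPlan` method are hypothetical; only the accessor names and return types come from the diff above.

```java
import org.apache.iceberg.parquet.ReadConf;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

// Sketch only: not part of the actual Comet integration.
class ReadConfInspector {
  static <T> void logReadPlan(ReadConf<T> conf) {
    ParquetFileReader fileReader = conf.reader();    // file reader with the projection already applied
    MessageType projection = conf.projection();      // projected Parquet schema
    boolean[] skippedRowGroups = conf.shouldSkip();  // row groups the reader will skip
    long totalValues = conf.totalValues();           // total number of records to read
    Integer batchSize = conf.batchSize();            // records per vectorized batch (may be null)
  }
}
```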
90 changes: 90 additions & 0 deletions parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReaderFactory.java
@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.parquet;

import java.nio.ByteBuffer;
import java.util.Map;
import java.util.function.Function;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.schema.MessageType;

/**
* Service Provider Interface (SPI) for creating custom vectorized Parquet readers.
*
* <p>Implementations of this interface can be loaded at runtime using Java's {@link
* java.util.ServiceLoader} mechanism. To register an implementation, create a file named {@code
* META-INF/services/org.apache.iceberg.parquet.VectorizedParquetReaderFactory} containing the fully
* qualified class name of the implementation.
*
* <p>This allows for pluggable vectorized reader implementations (e.g., Comet, Arrow, Velox)
* without requiring the core parquet module to depend on specific execution engines.
*/
public interface VectorizedParquetReaderFactory {

/**
* Returns the unique identifier for this reader factory.
*
* <p>This name is used to select the reader factory via configuration. For example, "comet" for
* the Comet vectorized reader.
*
* @return the unique name for this factory
*/
String name();

/**
* Creates a vectorized parquet reader with the given configuration.
*
* @param file the input file to read
* @param schema the expected schema for the data
* @param options parquet read options
* @param batchedReaderFunc function to create a VectorizedReader from a MessageType
* @param mapping name mapping for schema evolution
* @param filter filter expression to apply during reading
* @param reuseContainers whether to reuse containers for records
* @param caseSensitive whether column name matching should be case-sensitive
* @param maxRecordsPerBatch maximum number of records per batch
* @param properties additional properties for reader configuration
* @param start optional start position for reading
* @param length optional length to read
* @param fileEncryptionKey optional encryption key for the file
* @param fileAADPrefix optional AAD prefix for encryption
* @param <T> the type of records returned by the reader
* @return a closeable iterable of records
*/
<T> CloseableIterable<T> createReader(
InputFile file,
Schema schema,
ParquetReadOptions options,
Function<MessageType, VectorizedReader<?>> batchedReaderFunc,
NameMapping mapping,
Expression filter,
boolean reuseContainers,
boolean caseSensitive,
int maxRecordsPerBatch,
Map<String, String> properties,
Long start,
Long length,
ByteBuffer fileEncryptionKey,
ByteBuffer fileAADPrefix);
}
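To make the registration mechanism concrete, a hedged sketch of an implementation and its ServiceLoader registration follows. The package `com.example.reader`, the class `ExampleVectorizedReaderFactory`, and the body of `createReader` are hypothetical; only the interface contract above is taken from this change.

```java
package com.example.reader; // hypothetical package

import java.nio.ByteBuffer;
import java.util.Map;
import java.util.function.Function;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.parquet.VectorizedParquetReaderFactory;
import org.apache.iceberg.parquet.VectorizedReader;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.schema.MessageType;

public class ExampleVectorizedReaderFactory implements VectorizedParquetReaderFactory {

  @Override
  public String name() {
    // Selected by setting read.parquet.vectorized-reader.factory=example on the ReadBuilder
    return "example";
  }

  @Override
  public <T> CloseableIterable<T> createReader(
      InputFile file,
      Schema schema,
      ParquetReadOptions options,
      Function<MessageType, VectorizedReader<?>> batchedReaderFunc,
      NameMapping mapping,
      Expression filter,
      boolean reuseContainers,
      boolean caseSensitive,
      int maxRecordsPerBatch,
      Map<String, String> properties,
      Long start,
      Long length,
      ByteBuffer fileEncryptionKey,
      ByteBuffer fileAADPrefix) {
    // Build and return an engine-specific vectorized reader here.
    throw new UnsupportedOperationException("sketch only");
  }
}
```

The factory is discovered by adding a provider file named `META-INF/services/org.apache.iceberg.parquet.VectorizedParquetReaderFactory` to the implementing module's resources, containing the single line:

```
com.example.reader.ExampleVectorizedReaderFactory
```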
2 changes: 2 additions & 0 deletions spark/v3.4/build.gradle
@@ -264,6 +264,8 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersion}
integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
integrationImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts')
integrationImplementation project(path: ":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts')
integrationImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}"


// runtime dependencies for running Hive Catalog based integration test
integrationRuntimeOnly project(':iceberg-hive-metastore')
@@ -19,18 +19,22 @@
package org.apache.iceberg.spark.data.vectorized;

import java.io.IOException;
import org.apache.comet.CometConf;
import org.apache.comet.CometSchemaImporter;
import org.apache.comet.parquet.AbstractColumnReader;
import org.apache.comet.parquet.ColumnReader;
import org.apache.comet.parquet.ParquetColumnSpec;
import org.apache.comet.parquet.RowGroupReader;
import org.apache.comet.parquet.TypeUtil;
import org.apache.comet.parquet.Utils;
import org.apache.comet.shaded.arrow.memory.RootAllocator;
import org.apache.iceberg.parquet.VectorizedReader;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.parquet.CometTypeUtils;
import org.apache.iceberg.types.Types;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.PageReader;
import org.apache.spark.sql.internal.SQLConf;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
@@ -42,23 +46,28 @@ class CometColumnReader implements VectorizedReader<ColumnVector> {

private final ColumnDescriptor descriptor;
private final DataType sparkType;
private final int fieldId;

// The delegated ColumnReader from Comet side
private AbstractColumnReader delegate;
private boolean initialized = false;
private int batchSize = DEFAULT_BATCH_SIZE;
private CometSchemaImporter importer;
private ParquetColumnSpec spec;

CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) {
CometColumnReader(DataType sparkType, ColumnDescriptor descriptor, int fieldId) {
this.sparkType = sparkType;
this.descriptor = descriptor;
this.fieldId = fieldId;
}

CometColumnReader(Types.NestedField field) {
DataType dataType = SparkSchemaUtil.convert(field.type());
StructField structField = new StructField(field.name(), dataType, false, Metadata.empty());
this.sparkType = dataType;
this.descriptor = TypeUtil.convertToParquet(structField);
this.descriptor =
CometTypeUtils.buildColumnDescriptor(TypeUtil.convertToParquetSpec(structField));
this.fieldId = field.fieldId();
}

public AbstractColumnReader delegate() {
@@ -92,7 +101,26 @@ public void reset() {
}

this.importer = new CometSchemaImporter(new RootAllocator());
this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false);

spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor);

boolean useLegacyTime =
Boolean.parseBoolean(
SQLConf.get()
.getConfString(
CometConf.COMET_EXCEPTION_ON_LEGACY_DATE_TIMESTAMP().key(), "false"));
boolean useLazyMaterialization =
Boolean.parseBoolean(
SQLConf.get().getConfString(CometConf.COMET_USE_LAZY_MATERIALIZATION().key(), "false"));
this.delegate =
Utils.getColumnReader(
sparkType,
spec,
importer,
batchSize,
true, // Comet sets this to true for native execution
useLazyMaterialization,
useLegacyTime);
this.initialized = true;
}

@@ -111,9 +139,9 @@ public DataType sparkType() {
* <p>NOTE: this should be called before reading a new Parquet column chunk, and after {@link
* CometColumnReader#reset} is called.
*/
public void setPageReader(PageReader pageReader) throws IOException {
public void setPageReader(RowGroupReader pageStore) throws IOException {
Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first");
((ColumnReader) delegate).setPageReader(pageReader);
((ColumnReader) delegate).setRowGroupReader(pageStore, spec);
}

@Override