
Commit 0256f4a

Create a marker class for the Comet reader and a few extra nits
1 parent c3babfd commit 0256f4a

9 files changed: +48, -51 lines
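
This commit replaces config-flag dispatch for the Comet reader (the removed
"parquet.reader.type" property) with type-based dispatch: a trivial ColumnarBatch
subclass acts as a distinct registry key, so the (format, type) lookup in
FormatModelRegistry selects the Comet implementation on its own. A minimal,
self-contained sketch of the marker-class pattern, with hypothetical names (not
Iceberg APIs):

import java.util.HashMap;
import java.util.Map;

public class MarkerDispatchDemo {
  static class Batch {}

  // Marker subclass: adds no behavior, only a distinct Class token for lookup.
  static class CometBatch extends Batch {}

  // Readers registered by the batch class the caller asks for.
  private static final Map<Class<?>, String> READERS = new HashMap<>();

  public static void main(String[] args) {
    READERS.put(Batch.class, "arrow-reader");
    READERS.put(CometBatch.class, "comet-reader");

    // Callers pick an implementation by requesting the marker type,
    // instead of setting a string property on the read builder.
    System.out.println(READERS.get(Batch.class));      // arrow-reader
    System.out.println(READERS.get(CometBatch.class)); // comet-reader
  }
}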

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ public static void register() {
         new ParquetFormatModel<>(
             ColumnarBatch.class,
             Object.class,
-            (schema, messageType, constantValues, properties) ->
+            (schema, messageType, idToConstant) ->
                 ArrowReader.VectorizedCombinedScanIterator.buildReader(
                     schema,
                     messageType, /* setArrowValidityVector */

core/src/main/java/org/apache/iceberg/formats/FormatModel.java

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ public interface FormatModel<D, S> {
    *
    * @return the type of the data structures handled by this model implementation
    */
-  Class<D> type();
+  Class<? extends D> type();
 
   /**
    * Return the schema type class for the object model implementation processed by this factory.
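
Widening type() to Class<? extends D> is what lets a marker subclass token stand in
for its parent: CometColumnarBatch.class has static type Class<CometColumnarBatch>,
which is assignable to Class<? extends ColumnarBatch> but not to the invariant
Class<ColumnarBatch>. The same rule shown with JDK types only:

Class<? extends CharSequence> ok = String.class;  // compiles: covariant class token
// Class<CharSequence> bad = String.class;        // rejected: Class<T> is invariant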

core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java

Lines changed: 3 additions & 3 deletions
@@ -123,7 +123,7 @@ public static synchronized void register(FormatModel<?, ?> formatModel) {
    * @return a configured reader builder for the specified format and object model
    */
   public static <D, S> ReadBuilder<D, S> readBuilder(
-      FileFormat format, Class<D> type, InputFile inputFile) {
+      FileFormat format, Class<? extends D> type, InputFile inputFile) {
     FormatModel<D, S> factory = factoryFor(format, type);
     return factory.readBuilder(inputFile);
   }
@@ -144,7 +144,7 @@ public static <D, S> ReadBuilder<D, S> readBuilder(
    * @return a configured data write builder for creating a {@link DataWriter}
    */
   public static <D, S> DataWriteBuilder<D, S> dataWriteBuilder(
-      FileFormat format, Class<D> type, EncryptedOutputFile outputFile) {
+      FileFormat format, Class<? extends D> type, EncryptedOutputFile outputFile) {
     FormatModel<D, S> factory = factoryFor(format, type);
     return CommonWriteBuilderImpl.forDataFile(
         factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format);
@@ -198,7 +198,7 @@ public static PositionDeleteWriteBuilder positionDeleteWriteBuilder(
   }
 
   @SuppressWarnings("unchecked")
-  private static <D, S> FormatModel<D, S> factoryFor(FileFormat format, Class<D> type) {
+  private static <D, S> FormatModel<D, S> factoryFor(FileFormat format, Class<? extends D> type) {
     FormatModel<D, S> model = (FormatModel<D, S>) MODELS.get(Pair.of(format, type));
     Preconditions.checkArgument(
         model != null, "Format model is not registered for format %s and type %s", format, type);

orc/src/main/java/org/apache/iceberg/orc/ORC.java

Lines changed: 4 additions & 5 deletions
@@ -703,7 +703,7 @@ public static class ReadBuilder {
     private Function<TypeDescription, OrcRowReader<?>> readerFunc;
     private Function<TypeDescription, OrcBatchReader<?>> batchedReaderFunc;
     private int recordsPerBatch = VectorizedRowBatch.DEFAULT_SIZE;
-    private Set<Integer> constantFieldIds = ImmutableSet.of();
+    private Set<Integer> idToConstant = ImmutableSet.of();
 
     private ReadBuilder(InputFile file) {
       Preconditions.checkNotNull(file, "Input file cannot be null");
@@ -780,8 +780,8 @@ public ReadBuilder withNameMapping(NameMapping newNameMapping) {
       return this;
     }
 
-    ReadBuilder constantValues(Set<Integer> newConstantFieldIds) {
-      this.constantFieldIds = newConstantFieldIds;
+    ReadBuilder idToConstant(Set<Integer> newIdToConstant) {
+      this.idToConstant = newIdToConstant;
       return this;
     }
 
@@ -792,8 +792,7 @@ public <D> CloseableIterable<D> build() {
           conf,
          // This is a behavioral change. Previously there was an error if metadata columns were
          // present in the schema; now they are removed and the correct reader is created
-          TypeUtil.selectNot(
-              schema, Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds())),
+          TypeUtil.selectNot(schema, Sets.union(idToConstant, MetadataColumns.metadataFieldIds())),
           nameMapping,
           start,
           length,

orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java

Lines changed: 3 additions & 4 deletions
@@ -103,14 +103,13 @@ public ReadBuilder<D, S> readBuilder(InputFile inputFile) {
 
   @FunctionalInterface
   public interface ReaderFunction<D> {
-    OrcRowReader<D> read(
-        Schema schema, TypeDescription messageType, Map<Integer, ?> constantValues);
+    OrcRowReader<D> read(Schema schema, TypeDescription messageType, Map<Integer, ?> idToConstant);
   }
 
   @FunctionalInterface
   public interface BatchReaderFunction<D> {
     OrcBatchReader<D> read(
-        Schema schema, TypeDescription messageType, Map<Integer, ?> constantValues);
+        Schema schema, TypeDescription messageType, Map<Integer, ?> idToConstant);
   }
 
   @FunctionalInterface
@@ -180,7 +179,7 @@ public ReadBuilder<D, S> recordsPerBatch(int numRowsPerBatch) {
 
   @Override
   public ReadBuilder<D, S> idToConstant(Map<Integer, ?> newIdToConstant) {
-    internal.constantValues(newIdToConstant.keySet());
+    internal.idToConstant(newIdToConstant.keySet());
     this.idToConstant = newIdToConstant;
     return this;
   }

parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java

Lines changed: 7 additions & 11 deletions
@@ -45,14 +45,14 @@
 public class ParquetFormatModel<D, S> implements FormatModel<D, S> {
   public static final String WRITER_VERSION_KEY = "parquet.writer.version";
 
-  private final Class<D> type;
+  private final Class<? extends D> type;
   private final Class<S> schemaType;
   private final ReaderFunction<D> readerFunction;
   private final BatchReaderFunction<D> batchReaderFunction;
   private final WriterFunction<S> writerFunction;
 
   private ParquetFormatModel(
-      Class<D> type,
+      Class<? extends D> type,
       Class<S> schemaType,
       ReaderFunction<D> readerFunction,
       BatchReaderFunction<D> batchReaderFunction,
@@ -77,7 +77,7 @@ public ParquetFormatModel(
   }
 
   public ParquetFormatModel(
-      Class<D> type, Class<S> schemaType, BatchReaderFunction<D> batchReaderFunction) {
+      Class<? extends D> type, Class<S> schemaType, BatchReaderFunction<D> batchReaderFunction) {
     this(type, schemaType, null, batchReaderFunction, null);
   }
 
@@ -87,7 +87,7 @@ public FileFormat format() {
   }
 
   @Override
-  public Class<D> type() {
+  public Class<? extends D> type() {
     return type;
   }
 
@@ -109,16 +109,12 @@ public ReadBuilder<D, S> readBuilder(InputFile inputFile) {
   @FunctionalInterface
   public interface ReaderFunction<D> {
     ParquetValueReader<D> read(
-        Schema schema, MessageType messageType, Map<Integer, ?> constantValues);
+        Schema schema, MessageType messageType, Map<Integer, ?> idToConstant);
   }
 
   @FunctionalInterface
   public interface BatchReaderFunction<D> {
-    VectorizedReader<D> read(
-        Schema schema,
-        MessageType messageType,
-        Map<Integer, ?> constantValues,
-        Map<String, String> config);
+    VectorizedReader<D> read(Schema schema, MessageType messageType, Map<Integer, ?> idToConstant);
   }
 
   @FunctionalInterface
@@ -323,7 +319,7 @@ public CloseableIterable<D> build() {
       return internal
           .createBatchedReaderFunc(
               (icebergSchema, messageType) ->
-                  batchReaderFunction.read(icebergSchema, messageType, idToConstant, config))
+                  batchReaderFunction.read(icebergSchema, messageType, idToConstant))
           .build();
     } else {
       throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set");

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java

Lines changed: 10 additions & 17 deletions
@@ -28,9 +28,9 @@
 import org.apache.iceberg.arrow.vectorized.VectorizedReaderBuilder;
 import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
 import org.apache.iceberg.parquet.VectorizedReader;
-import org.apache.iceberg.spark.ParquetReaderType;
 import org.apache.iceberg.spark.SparkUtil;
 import org.apache.parquet.schema.MessageType;
+import org.apache.spark.sql.vectorized.ColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -43,8 +43,6 @@ public class VectorizedSparkParquetReaders {
   private static final String ENABLE_NULL_CHECK_FOR_GET = "arrow.enable_null_check_for_get";
   private static final String ENABLE_NULL_CHECK_FOR_GET_ENV = "ARROW_ENABLE_NULL_CHECK_FOR_GET";
 
-  public static final String PARQUET_READER_TYPE = "parquet.reader.type";
-
   static {
     try {
       enableUnsafeMemoryAccess();
@@ -56,18 +54,6 @@ public class VectorizedSparkParquetReaders {
 
   private VectorizedSparkParquetReaders() {}
 
-  public static VectorizedReader<ColumnarBatch> buildReader(
-      Schema expectedSchema,
-      MessageType fileSchema,
-      Map<Integer, ?> idToConstant,
-      Map<String, String> config) {
-    if (ParquetReaderType.COMET.name().equals(config.get(PARQUET_READER_TYPE))) {
-      return buildCometReader(expectedSchema, fileSchema, idToConstant);
-    } else {
-      return buildReader(expectedSchema, fileSchema, idToConstant);
-    }
-  }
-
   public static ColumnarBatchReader buildReader(
       Schema expectedSchema,
       MessageType fileSchema,
@@ -91,9 +77,9 @@ public static ColumnarBatchReader buildReader(
     return buildReader(expectedSchema, fileSchema, idToConstant, ArrowAllocation.rootAllocator());
   }
 
-  public static CometColumnarBatchReader buildCometReader(
+  public static VectorizedReader<ColumnarBatch> buildCometReader(
       Schema expectedSchema, MessageType fileSchema, Map<Integer, ?> idToConstant) {
-    return (CometColumnarBatchReader)
+    return (VectorizedReader<ColumnarBatch>)
         TypeWithSchemaVisitor.visit(
             expectedSchema.asStruct(),
             fileSchema,
@@ -104,6 +90,13 @@ public static CometColumnarBatchReader buildCometReader(
             readers -> new CometColumnarBatchReader(readers, expectedSchema)));
   }
 
+  /** A subclass of ColumnarBatch to identify Comet readers. */
+  public static class CometColumnarBatch extends ColumnarBatch {
+    public CometColumnarBatch(ColumnVector[] columns) {
+      super(columns);
+    }
+  }
+
   // enables unsafe memory access to avoid costly checks to see if index is within bounds
   // as long as it is not configured explicitly (see BoundsChecking in Arrow)
   private static void enableUnsafeMemoryAccess() {

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java

Lines changed: 13 additions & 9 deletions
@@ -35,6 +35,7 @@
 import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.spark.OrcBatchReadConf;
 import org.apache.iceberg.spark.ParquetBatchReadConf;
+import org.apache.iceberg.spark.ParquetReaderType;
 import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter;
 import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil;
 import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector;
@@ -72,17 +73,20 @@ protected CloseableIterable<ColumnarBatch> newBatchIterable(
       Expression residual,
       Map<Integer, ?> idToConstant,
       @Nonnull SparkDeleteFilter deleteFilter) {
-    ReadBuilder<ColumnarBatch, ?> readBuilder =
-        FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile);
+    ReadBuilder<ColumnarBatch, ?> readBuilder;
     if (parquetConf != null) {
       readBuilder =
-          readBuilder
-              .recordsPerBatch(parquetConf.batchSize())
-              .set(
-                  VectorizedSparkParquetReaders.PARQUET_READER_TYPE,
-                  parquetConf.readerType().name());
-    } else if (orcConf != null) {
-      readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize());
+          parquetConf.readerType() == ParquetReaderType.COMET
+              ? FormatModelRegistry.readBuilder(
+                  format, VectorizedSparkParquetReaders.CometColumnarBatch.class, inputFile)
+              : FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile);
+
+      readBuilder = readBuilder.recordsPerBatch(parquetConf.batchSize());
+    } else {
+      readBuilder = FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile);
+      if (orcConf != null) {
+        readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize());
+      }
     }
 
     CloseableIterable<ColumnarBatch> iterable =

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java

Lines changed: 6 additions & 0 deletions
@@ -55,6 +55,12 @@ public static void register() {
         new ParquetFormatModel<>(
             ColumnarBatch.class, StructType.class, VectorizedSparkParquetReaders::buildReader));
 
+    FormatModelRegistry.register(
+        new ParquetFormatModel<>(
+            VectorizedSparkParquetReaders.CometColumnarBatch.class,
+            StructType.class,
+            VectorizedSparkParquetReaders::buildCometReader));
+
     FormatModelRegistry.register(
         new ORCFormatModel<>(
             InternalRow.class,
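
With the Comet model registered under the marker class, callers select it through
the registry key alone, as in the BaseBatchReader change above. A hedged usage
sketch (inputFile is assumed to be an InputFile already in scope):

ReadBuilder<ColumnarBatch, ?> builder =
    FormatModelRegistry.readBuilder(
        FileFormat.PARQUET,
        VectorizedSparkParquetReaders.CometColumnarBatch.class,
        inputFile);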
