apache · Will-Lo · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/...lin-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinBaseOrcWriter.java b/...lin-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinBaseOrcWriter.java
@@ -61,6 +61,7 @@ public abstract class GobblinBaseOrcWriter<S, D> extends FsDataWriter<D> {
   protected int batchSize;
   protected final S inputSchema;
 
+  private final boolean validateORCDuringCommit;
   private final boolean selfTuningWriter;
   private int selfTuneRowsBetweenCheck;
   private double rowBatchMemoryUsageFactor;
@@ -94,6 +95,7 @@ public GobblinBaseOrcWriter(FsDataWriterBuilder<S, D> builder, State properties)
     this.inputSchema = builder.getSchema();
     this.typeDescription = getOrcSchema();
     this.selfTuningWriter = properties.getPropAsBoolean(GobblinOrcWriterConfigs.ORC_WRITER_AUTO_SELFTUNE_ENABLED, false);
+    this.validateORCDuringCommit = properties.getPropAsBoolean(GobblinOrcWriterConfigs.ORC_WRITER_VALIDATE_FILE_DURING_COMMIT, false);
     this.maxOrcBatchSize = properties.getPropAsInt(GobblinOrcWriterConfigs.ORC_WRITER_AUTO_SELFTUNE_MAX_BATCH_SIZE,
         GobblinOrcWriterConfigs.DEFAULT_MAX_ORC_WRITER_BATCH_SIZE);
     this.batchSize = this.selfTuningWriter ?
@@ -259,6 +261,15 @@ public void commit()
       throws IOException {
     closeInternal();
     super.commit();
+    if(this.validateORCDuringCommit) {
+      try {
+        OrcFile.createReader(this.outputFile, new OrcFile.ReaderOptions(conf));
+      } catch (IOException ioException) {
+        log.error("Found error when validating ORC file {} during commit phase", this.outputFile, ioException);
+        log.error("Delete the malformed ORC file is successful: {}", this.fs.delete(this.outputFile, false));
+        throw ioException;
+      }
+    }
     if (this.selfTuningWriter) {
       properties.setProp(GobblinOrcWriterConfigs.RuntimeStateConfigs.ORC_WRITER_ESTIMATED_RECORD_SIZE, String.valueOf(getEstimatedRecordSizeBytes()));
       properties.setProp(GobblinOrcWriterConfigs.RuntimeStateConfigs.ORC_WRITER_ESTIMATED_BYTES_ALLOCATED_CONVERTER_MEMORY,

diff --git a/...-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinOrcWriterConfigs.java b/...-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinOrcWriterConfigs.java
@@ -22,6 +22,11 @@
  */
 public class GobblinOrcWriterConfigs {
   public static final String ORC_WRITER_PREFIX = "orcWriter.";
+  /**
+   * Configuration for enabling validation of the ORC file during commit to detect malformed files. If enabled, a
+   * file that fails validation is deleted and the exception is rethrown during commit. Defaults to false.
+   */
+  public static final String ORC_WRITER_VALIDATE_FILE_DURING_COMMIT = ORC_WRITER_PREFIX + "validate.commit.file";
   /**
    * Default buffer size in the ORC Writer before sending the records to the native ORC Writer
    */