apache · Will-Lo · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/...lin-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinBaseOrcWriter.java b/...lin-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinBaseOrcWriter.java
@@ -24,6 +24,7 @@
 import java.util.Queue;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.gobblin.util.HadoopUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.orc.OrcConf;
@@ -61,6 +62,7 @@ public abstract class GobblinBaseOrcWriter<S, D> extends FsDataWriter<D> {
   protected int batchSize;
   protected final S inputSchema;
 
+  private final boolean validateORCDuringCommit;
   private final boolean selfTuningWriter;
   private int selfTuneRowsBetweenCheck;
   private double rowBatchMemoryUsageFactor;
@@ -94,6 +96,7 @@ public GobblinBaseOrcWriter(FsDataWriterBuilder<S, D> builder, State properties)
     this.inputSchema = builder.getSchema();
     this.typeDescription = getOrcSchema();
     this.selfTuningWriter = properties.getPropAsBoolean(GobblinOrcWriterConfigs.ORC_WRITER_AUTO_SELFTUNE_ENABLED, false);
+    this.validateORCDuringCommit = properties.getPropAsBoolean(GobblinOrcWriterConfigs.ORC_WRITER_VALIDATE_FILE_DURING_COMMIT, false);
     this.maxOrcBatchSize = properties.getPropAsInt(GobblinOrcWriterConfigs.ORC_WRITER_AUTO_SELFTUNE_MAX_BATCH_SIZE,
         GobblinOrcWriterConfigs.DEFAULT_MAX_ORC_WRITER_BATCH_SIZE);
     this.batchSize = this.selfTuningWriter ?
@@ -258,7 +261,18 @@ public void close()
   public void commit()
       throws IOException {
     closeInternal();
+    if(this.validateORCDuringCommit) {
+      try {
+        OrcFile.createReader(this.stagingFile, new OrcFile.ReaderOptions(conf));
+      } catch (Exception e) {
+        log.error("Found error when validating ORC file during commit phase", e);
+        HadoopUtils.deletePath(this.fs, this.stagingFile, false);
+        log.error("Delete the malformed ORC file after close the writer: {}", this.stagingFile);
+        throw e;
+      }
+    }
     super.commit();
+
     if (this.selfTuningWriter) {
       properties.setProp(GobblinOrcWriterConfigs.RuntimeStateConfigs.ORC_WRITER_ESTIMATED_RECORD_SIZE, String.valueOf(getEstimatedRecordSizeBytes()));
       properties.setProp(GobblinOrcWriterConfigs.RuntimeStateConfigs.ORC_WRITER_ESTIMATED_BYTES_ALLOCATED_CONVERTER_MEMORY,

diff --git a/...-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinOrcWriterConfigs.java b/...-modules/gobblin-orc/src/main/java/org/apache/gobblin/writer/GobblinOrcWriterConfigs.java
@@ -22,6 +22,11 @@
  */
 public class GobblinOrcWriterConfigs {
   public static final String ORC_WRITER_PREFIX = "orcWriter.";
+  /**
+   * Configuration key that enables validating the staged ORC file during commit. If enabled and the file is found
+   * to be malformed, the writer deletes the malformed file and rethrows the validation exception.
+   */
+  public static final String ORC_WRITER_VALIDATE_FILE_DURING_COMMIT = ORC_WRITER_PREFIX + "validate.commit.file";
   /**
    * Default buffer size in the ORC Writer before sending the records to the native ORC Writer
    */