72 changes: 72 additions & 0 deletions aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java
@@ -353,6 +353,30 @@ public class AwsProperties implements Serializable {
@Deprecated
public static final boolean CLIENT_ENABLE_ETAG_CHECK_DEFAULT = false;

/**
* Number of times to retry an S3 read operation.
*/
public static final String S3_READ_RETRY_NUM_RETRIES = "s3.read.retry.num-retries";
public static final int S3_READ_RETRY_NUM_RETRIES_DEFAULT = 7;

/**
* Minimum wait time before retrying an S3 read operation.
*/
public static final String S3_READ_RETRY_MIN_WAIT_MS = "s3.read.retry.min-wait-ms";
public static final long S3_READ_RETRY_MIN_WAIT_MS_DEFAULT = 500; // 0.5 seconds

/**
* Maximum wait time before retrying an S3 read operation.
*/
public static final String S3_READ_RETRY_MAX_WAIT_MS = "s3.read.retry.max-wait-ms";
public static final long S3_READ_RETRY_MAX_WAIT_MS_DEFAULT = 2 * 60 * 1000; // 2 minutes

/**
* Total retry timeout for an S3 read operation.
*/
public static final String S3_READ_RETRY_TOTAL_TIMEOUT_MS = "s3.read.retry.total-timeout-ms";
public static final long S3_READ_RETRY_TOTAL_TIMEOUT_MS_DEFAULT = 10 * 60 * 1000; // 10 minutes

/**
* Used by {@link LakeFormationAwsClientFactory}.
* The table name used as part of lake formation credentials request.
@@ -380,6 +404,10 @@ public class AwsProperties implements Serializable {
private int s3FileIoDeleteThreads;
private boolean isS3DeleteEnabled;
private final Map<String, String> s3BucketToAccessPointMapping;
private int s3ReadRetryNumRetries;
private long s3ReadRetryMinWaitMs;
private long s3ReadRetryMaxWaitMs;
private long s3ReadRetryTotalTimeoutMs;

private String glueCatalogId;
private boolean glueCatalogSkipArchive;
@@ -404,6 +432,10 @@ public AwsProperties() {
this.s3FileIoDeleteThreads = Runtime.getRuntime().availableProcessors();
this.isS3DeleteEnabled = S3_DELETE_ENABLED_DEFAULT;
this.s3BucketToAccessPointMapping = ImmutableMap.of();
this.s3ReadRetryNumRetries = S3_READ_RETRY_NUM_RETRIES_DEFAULT;
this.s3ReadRetryMinWaitMs = S3_READ_RETRY_MIN_WAIT_MS_DEFAULT;
this.s3ReadRetryMaxWaitMs = S3_READ_RETRY_MAX_WAIT_MS_DEFAULT;
this.s3ReadRetryTotalTimeoutMs = S3_READ_RETRY_TOTAL_TIMEOUT_MS_DEFAULT;

this.glueCatalogId = null;
this.glueCatalogSkipArchive = GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT;
@@ -472,6 +504,14 @@ public AwsProperties(Map<String, String> properties) {
Runtime.getRuntime().availableProcessors());
this.isS3DeleteEnabled = PropertyUtil.propertyAsBoolean(properties, S3_DELETE_ENABLED, S3_DELETE_ENABLED_DEFAULT);
this.s3BucketToAccessPointMapping = PropertyUtil.propertiesWithPrefix(properties, S3_ACCESS_POINTS_PREFIX);
this.s3ReadRetryNumRetries = PropertyUtil.propertyAsInt(properties, S3_READ_RETRY_NUM_RETRIES,
S3_READ_RETRY_NUM_RETRIES_DEFAULT);
this.s3ReadRetryMinWaitMs = PropertyUtil.propertyAsLong(properties, S3_READ_RETRY_MIN_WAIT_MS,
S3_READ_RETRY_MIN_WAIT_MS_DEFAULT);
this.s3ReadRetryMaxWaitMs = PropertyUtil.propertyAsLong(properties, S3_READ_RETRY_MAX_WAIT_MS,
S3_READ_RETRY_MAX_WAIT_MS_DEFAULT);
this.s3ReadRetryTotalTimeoutMs = PropertyUtil.propertyAsLong(properties, S3_READ_RETRY_TOTAL_TIMEOUT_MS,
S3_READ_RETRY_TOTAL_TIMEOUT_MS_DEFAULT);

this.dynamoDbTableName = PropertyUtil.propertyAsString(properties, DYNAMODB_TABLE_NAME,
DYNAMODB_TABLE_NAME_DEFAULT);
@@ -613,6 +653,38 @@ public void setS3DeleteEnabled(boolean s3DeleteEnabled) {
this.isS3DeleteEnabled = s3DeleteEnabled;
}

public int s3ReadRetryNumRetries() {
return s3ReadRetryNumRetries;
}

public void setS3ReadRetryNumRetries(int s3ReadRetryNumRetries) {
this.s3ReadRetryNumRetries = s3ReadRetryNumRetries;
}

public long s3ReadRetryMinWaitMs() {
return s3ReadRetryMinWaitMs;
}

public void setS3ReadRetryMinWaitMs(long s3ReadRetryMinWaitMs) {
this.s3ReadRetryMinWaitMs = s3ReadRetryMinWaitMs;
}

public long s3ReadRetryMaxWaitMs() {
return s3ReadRetryMaxWaitMs;
}

public void setS3ReadRetryMaxWaitMs(long s3ReadRetryMaxWaitMs) {
this.s3ReadRetryMaxWaitMs = s3ReadRetryMaxWaitMs;
}

public long s3ReadRetryTotalTimeoutMs() {
return s3ReadRetryTotalTimeoutMs;
}

public void setS3ReadRetryTotalTimeoutMs(long s3ReadRetryTotalTimeoutMs) {
this.s3ReadRetryTotalTimeoutMs = s3ReadRetryTotalTimeoutMs;
}

private Set<Tag> toTags(Map<String, String> properties, String prefix) {
return PropertyUtil.propertiesWithPrefix(properties, prefix)
.entrySet().stream()
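For context (not part of the diff), a minimal sketch of how the new retry properties could be supplied, assuming they are passed through the same string-keyed properties map that the AwsProperties(Map) constructor above already parses; the values are illustrative, not recommendations:

import java.util.Map;
import org.apache.iceberg.aws.AwsProperties;

public class S3ReadRetryConfigExample {
  public static void main(String[] args) {
    // Keys match the constants added in this PR; values are arbitrary examples.
    Map<String, String> catalogProperties = Map.of(
        "s3.read.retry.num-retries", "5",
        "s3.read.retry.min-wait-ms", "250",
        "s3.read.retry.max-wait-ms", "60000",
        "s3.read.retry.total-timeout-ms", "300000");

    AwsProperties awsProperties = new AwsProperties(catalogProperties);

    // S3InputStream reads the parsed values through the new accessors.
    System.out.println(awsProperties.s3ReadRetryNumRetries()); // 5
    System.out.println(awsProperties.s3ReadRetryMinWaitMs());  // 250
  }
}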
181 changes: 157 additions & 24 deletions aws/src/main/java/org/apache/iceberg/aws/s3/S3InputStream.java
@@ -19,8 +19,11 @@

package org.apache.iceberg.aws.s3;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.net.HttpURLConnection;
import java.util.Arrays;
import org.apache.iceberg.aws.AwsProperties;
import org.apache.iceberg.io.FileIOMetricsContext;
@@ -33,11 +36,16 @@
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
import org.apache.iceberg.util.Tasks;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.awscore.exception.AwsServiceException;
import software.amazon.awssdk.core.exception.AbortedException;
import software.amazon.awssdk.core.sync.ResponseTransformer;
import software.amazon.awssdk.http.Abortable;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.S3Exception;

class S3InputStream extends SeekableInputStream implements RangeReadable {
private static final Logger LOG = LoggerFactory.getLogger(S3InputStream.class);
@@ -88,23 +96,69 @@ public void seek(long newPos) {

@Override
public int read() throws IOException {
Preconditions.checkState(!closed, "Cannot read: already closed");
positionStream();
int[] byteRef = new int[1];
Contributor: This is less than ideal, but Tasks is a little limited in this area because it's really based on threaded execution and we're reusing it here for retry. It might be worth exploring whether we can tweak Tasks to support this use case:

Ideally we'd have something like:

int read = Tasks.single()
    .exponentialBackoff(...)
    ...
    .run(<function>);

Contributor: We could also consider bringing in Failsafe, which is more closely aligned with what we want for these cases, though it is a new dependency (it has zero dependencies of its own and is Apache licensed).
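Purely as an illustration of that suggestion (Failsafe is not added by this PR), a single-byte read retried with Failsafe 3.x (dev.failsafe) could look roughly like the sketch below; it assumes the existing S3InputStream members awsProperties, stream, positionStream() and closeStream(), with the backoff values taken from the new AwsProperties accessors:

import dev.failsafe.Failsafe;
import dev.failsafe.RetryPolicy;
import java.io.IOException;
import java.time.Duration;

// Hypothetical helper inside S3InputStream; a sketch, not part of this PR.
private int readWithFailsafe() {
  RetryPolicy<Integer> policy = RetryPolicy.<Integer>builder()
      .handle(IOException.class) // retry only IO failures
      .withBackoff(
          Duration.ofMillis(awsProperties.s3ReadRetryMinWaitMs()),
          Duration.ofMillis(awsProperties.s3ReadRetryMaxWaitMs()))
      .withMaxRetries(awsProperties.s3ReadRetryNumRetries())
      .build();

  return Failsafe.with(policy).get(() -> {
    try {
      positionStream();
      return stream.read();
    } catch (IOException e) {
      closeStream(); // force the next attempt to reopen at the current position
      throw e;
    }
  });
}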

try {
Tasks.foreach(0)
Contributor: Any reason we don't pass the stream into the foreach? I guess it's a private field, so it's a bit awkward to pass in as an argument, but it seems more readable than Tasks.foreach(0) to me.

Contributor (Author): I think we cannot, because the input stream needs to be closed and re-opened; if we passed it in here, the retry would always run against the same stream, which is already closed after the first failure.

Contributor: This is a lot of duplicate code for the retry logic. Could we consolidate it into a method that wraps just the logic we want to execute?

For example:

retry(() -> {
    Preconditions.checkState(!closed, "Cannot read: already closed");
    positionStream();

    byteRef[0] =  stream.read();
})

(You may have to use a consumer/function for the range versions, but there is probably a more concise way to write this.)
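A sketch of one way that consolidation could look with the Tasks API already used in this PR; retryRead is a hypothetical helper, and each read method would pass in only the body that actually touches the stream:

// Hypothetical private helper in S3InputStream wrapping the shared retry chain.
private void retryRead(Runnable readOperation) throws IOException {
  try {
    Tasks.foreach(0)
        .retry(awsProperties.s3ReadRetryNumRetries())
        .exponentialBackoff(
            awsProperties.s3ReadRetryMinWaitMs(),
            awsProperties.s3ReadRetryMaxWaitMs(),
            awsProperties.s3ReadRetryTotalTimeoutMs(),
            2.0 /* exponential */)
        .shouldRetryTest(S3InputStream::shouldRetry)
        .throwFailureWhenFinished()
        .run(ignored -> readOperation.run());
  } catch (UncheckedIOException e) {
    throw e.getCause();
  }
}

// read() would then shrink to roughly:
// int[] byteRef = new int[1];
// retryRead(() -> {
//   try {
//     Preconditions.checkState(!closed, "Cannot read: already closed");
//     positionStream();
//     byteRef[0] = stream.read();
//   } catch (IOException e) {
//     closeStream();
//     throw new UncheckedIOException(e);
//   }
// });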

.retry(awsProperties.s3ReadRetryNumRetries())
.exponentialBackoff(
awsProperties.s3ReadRetryMinWaitMs(),
awsProperties.s3ReadRetryMaxWaitMs(),
awsProperties.s3ReadRetryTotalTimeoutMs(),
2.0 /* exponential */)
.shouldRetryTest(S3InputStream::shouldRetry)
.throwFailureWhenFinished()
.run(ignored -> {
try {
Preconditions.checkState(!closed, "Cannot read: already closed");
positionStream();

byteRef[0] = stream.read();
} catch (IOException e) {
closeStream();
throw new UncheckedIOException(e);
Contributor: On an unrecoverable network error, abortStream() is required to avoid the failed TLS connection being recycled. This adds a new challenge: identifying an unrecoverable network error among the deeply nested stacks that can come from the SDK, and distinguishing it from other failures.

}
});
} catch (UncheckedIOException e) {
throw e.getCause();
}

pos += 1;
next += 1;
readBytes.increment();
readOperations.increment();

return stream.read();
return byteRef[0];
}

@Override
public int read(byte[] b, int off, int len) throws IOException {
Preconditions.checkState(!closed, "Cannot read: already closed");
positionStream();
int[] bytesReadRef = new int[1];
try {
Tasks.foreach(0)
.retry(awsProperties.s3ReadRetryNumRetries())
.exponentialBackoff(
awsProperties.s3ReadRetryMinWaitMs(),
awsProperties.s3ReadRetryMaxWaitMs(),
awsProperties.s3ReadRetryTotalTimeoutMs(),
2.0 /* exponential */)
.shouldRetryTest(S3InputStream::shouldRetry)
.throwFailureWhenFinished()
.run(ignored -> {
try {
Preconditions.checkState(!closed, "Cannot read: already closed");
positionStream();

bytesReadRef[0] = stream.read(b, off, len);
} catch (IOException e) {
closeStream();
throw new UncheckedIOException(e);
}
});
} catch (UncheckedIOException e) {
throw e.getCause();
}

int bytesRead = stream.read(b, off, len);
int bytesRead = bytesReadRef[0];
pos += bytesRead;
next += bytesRead;
readBytes.increment((long) bytesRead);
@@ -118,17 +172,65 @@ public void readFully(long position, byte[] buffer, int offset, int length) thro
Preconditions.checkPositionIndexes(offset, offset + length, buffer.length);

String range = String.format("bytes=%s-%s", position, position + length - 1);

IOUtil.readFully(readRange(range), buffer, offset, length);
try {
Tasks.foreach(0)
.retry(awsProperties.s3ReadRetryNumRetries())
.exponentialBackoff(
awsProperties.s3ReadRetryMinWaitMs(),
awsProperties.s3ReadRetryMaxWaitMs(),
awsProperties.s3ReadRetryTotalTimeoutMs(),
2.0 /* exponential */)
.shouldRetryTest(S3InputStream::shouldRetry)
.throwFailureWhenFinished()
.run(ignored -> {
InputStream rangeStream = null;
try {
rangeStream = readRange(range);
IOUtil.readFully(rangeStream, buffer, offset, length);
} catch (IOException e) {
throw new UncheckedIOException(e);
} finally {
closeServerSideStream(rangeStream);
}
});
} catch (UncheckedIOException e) {
throw e.getCause();
}
}

@Override
public int readTail(byte[] buffer, int offset, int length) throws IOException {
Preconditions.checkPositionIndexes(offset, offset + length, buffer.length);

String range = String.format("bytes=-%s", length);
int[] bytesReadRef = new int[1];

try {
Tasks.foreach(0)
.retry(awsProperties.s3ReadRetryNumRetries())
.exponentialBackoff(
awsProperties.s3ReadRetryMinWaitMs(),
awsProperties.s3ReadRetryMaxWaitMs(),
awsProperties.s3ReadRetryTotalTimeoutMs(),
2.0 /* exponential */)
.shouldRetryTest(S3InputStream::shouldRetry)
.throwFailureWhenFinished()
.run(ignored -> {
InputStream rangeStream = null;
try {
rangeStream = readRange(range);
bytesReadRef[0] = IOUtil.readRemaining(rangeStream, buffer, offset, length);
} catch (IOException e) {
throw new UncheckedIOException(e);
} finally {
closeServerSideStream(rangeStream);
}
});
} catch (UncheckedIOException e) {
throw e.getCause();
}

return IOUtil.readRemaining(readRange(range), buffer, offset, length);
return bytesReadRef[0];
}

private InputStream readRange(String range) {
@@ -172,31 +274,62 @@ private void positionStream() throws IOException {
}

// close the stream and open at desired position
LOG.debug("Seek with new stream for {} to offset {}", location, next);
LOG.warn("Seek with new stream for {} to offset {}", location, next);
pos = next;
openStream();
}

private void openStream() throws IOException {
GetObjectRequest.Builder requestBuilder = GetObjectRequest.builder()
.bucket(location.bucket())
.key(location.key())
.range(String.format("bytes=%s-", pos));

S3RequestUtil.configureEncryption(awsProperties, requestBuilder);

private void openStream() {
closeStream();
stream = s3.getObject(requestBuilder.build(), ResponseTransformer.toInputStream());
stream = readRange(String.format("bytes=%s-", pos));
}

private void closeStream() {
closeServerSideStream(stream);
stream = null;
}

private void closeStream() throws IOException {
if (stream != null) {
stream.close();
private static void closeServerSideStream(InputStream streamToClose) {
Contributor: I'd just put this in closeStream() rather than have a separate method for it.

if (streamToClose != null) {
try {
if (streamToClose instanceof Abortable) {
// Stated in the ResponseInputStream javadoc:
// If it is not desired to read remaining data from the stream,
// you can explicitly abort the connection via abort().
((Abortable) streamToClose).abort();
Contributor: abort() removes the connection from the HTTP pool.
• this is good if you are closing after an unrecoverable network error
• it is absolutely what you do not want to do on a normal read
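To make that distinction concrete, here is a hedged sketch (not what this PR implements, which always aborts Abortable streams) of separating the normal close path from the error path:

// Sketch only: abort() for broken connections, close() when the connection can be reused.
private static void closeOrAbort(InputStream streamToClose, boolean encounteredError) {
  if (streamToClose == null) {
    return;
  }
  try {
    if (encounteredError && streamToClose instanceof Abortable) {
      ((Abortable) streamToClose).abort(); // drop a possibly broken connection from the pool
    } else {
      streamToClose.close(); // normal path: let the SDK recycle the connection
    }
  } catch (IOException | AbortedException e) {
    // ignore failures while cleaning up
  }
}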

} else {
streamToClose.close();
}
} catch (IOException | AbortedException e) {
// ignore failure to abort or close stream
}
}
}

public void setSkipSize(int skipSize) {
this.skipSize = skipSize;
private static boolean shouldRetry(Exception exception) {
Contributor: I think we need to be more explicit about the cases where we do want to retry rather than just defaulting to retry. I think there were specific cases that were mentioned as known issues (e.g. socket timeout). However, the problem we've had in other retry scenarios is that the retry cases are overly broad and retry when they really shouldn't.

I think the default here should return false and only return true for the known Exceptions.

Contributor: I'm reasonably confident it's a lot more complicated than this, especially as SDK-level failures often create deep chains with the underlying cause at the bottom.
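If the default were flipped as suggested above, an allowlist-style check that also walks the cause chain might look roughly like the sketch below; the specific exception classes treated as retryable are an assumption for illustration, not a list established by this PR:

// Sketch only: retry nothing by default, opting in known transient network failures.
private static boolean shouldRetryAllowlist(Exception exception) {
  Throwable current = exception;
  while (current != null) {
    if (current instanceof java.io.EOFException) {
      return false; // reading past the end of the object will not recover
    }
    if (current instanceof java.net.SocketTimeoutException
        || current instanceof java.net.SocketException
        || current instanceof javax.net.ssl.SSLException) {
      return true; // likely transient network failure
    }
    current = current.getCause();
  }
  return false; // unknown failures are not retried
}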

if (exception instanceof UncheckedIOException) {
if (exception.getCause() instanceof EOFException) {
return false;
}
}

if (exception instanceof AwsServiceException) {
switch (((AwsServiceException) exception).statusCode()) {
case HttpURLConnection.HTTP_FORBIDDEN:
case HttpURLConnection.HTTP_BAD_REQUEST:
return false;
}
}

if (exception instanceof S3Exception) {
switch (((S3Exception) exception).statusCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case 416: // range not satisfied
return false;
}
}

return true;
Comment on lines +309 to +332
Contributor (@singhpk234, May 31, 2022): Should we add this to a separate util class, considering it can be extended to all S3 interactions?

Also, are there any pointers on whether we know this is the complete list, given the APIs we use to connect to S3? For example: (sample list: S3A)

Contributor (Author): Let's discuss this in the thread above: #4912 (comment)

Comment on lines +310 to +332
Contributor: Curious, is this the same retry policy that Hadoop S3A has?

Contributor (Author): It's not. It's closer to the ones in Presto and Trino. Basically it retries almost all IO exceptions except for EOF, because those are most likely network issues. For AWS-side exceptions, this logic seems sufficient to me if it has proven sufficient in Presto and Trino. I am not sure we need to list every single possible exception class like S3A did.

}

@SuppressWarnings("checkstyle:NoFinalizer")