diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 75db90092d151..77e329c7f3094 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1529,6 +1529,18 @@ private Constants() { */ public static final String AWS_S3_ACCESSPOINT_REQUIRED = "fs.s3a.accesspoint.required"; + /** + * Explicit request for the SDK region resolution. + * Value: {@code}. + */ + public static final String SDK_REGION = "sdk"; + + /** + * An empty region is the historic fall-through to the SDK. + * Value: "" + */ + public static final String EMPTY_REGION = ""; + /** * Flag for create performance. * This can be set in the {code createFile()} builder. diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index 41e904ec9de1b..51cae69626e67 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -20,16 +20,10 @@ import java.io.IOException; import java.net.URI; -import java.net.URISyntaxException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.hadoop.classification.VisibleForTesting; -import org.apache.hadoop.fs.s3a.impl.AWSClientConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import software.amazon.awssdk.awscore.util.AwsHostNameUtils; import software.amazon.awssdk.core.checksums.RequestChecksumCalculation; import software.amazon.awssdk.core.checksums.ResponseChecksumValidation; import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; @@ -57,17 +51,14 @@ import 
org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.s3a.impl.AWSClientConfig; +import org.apache.hadoop.fs.s3a.impl.RegionResolution; import org.apache.hadoop.fs.s3a.statistics.impl.AwsStatisticsCollector; import org.apache.hadoop.fs.store.LogExactlyOnce; -import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION; +import static java.util.Objects.requireNonNull; import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_ACCESS_GRANTS_ENABLED; import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_ACCESS_GRANTS_FALLBACK_TO_IAM_ENABLED; -import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_CROSS_REGION_ACCESS_ENABLED; -import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_CROSS_REGION_ACCESS_ENABLED_DEFAULT; -import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_DEFAULT_REGION; -import static org.apache.hadoop.fs.s3a.Constants.CENTRAL_ENDPOINT; -import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.HTTP_SIGNER_CLASS_NAME; import static org.apache.hadoop.fs.s3a.Constants.HTTP_SIGNER_ENABLED; import static org.apache.hadoop.fs.s3a.Constants.HTTP_SIGNER_ENABLED_DEFAULT; @@ -77,7 +68,8 @@ import static org.apache.hadoop.fs.s3a.auth.SignerFactory.createHttpSigner; import static org.apache.hadoop.fs.s3a.impl.AWSHeaders.REQUESTER_PAYS_HEADER; import static org.apache.hadoop.fs.s3a.impl.InternalConstants.AUTH_SCHEME_AWS_SIGV_4; -import static org.apache.hadoop.util.Preconditions.checkArgument; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.RegionResolutionMechanism.Sdk; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.calculateRegion; /** @@ -92,11 +84,6 @@ public class DefaultS3ClientFactory extends Configured private static final String REQUESTER_PAYS_HEADER_VALUE = "requester"; - private static final String S3_SERVICE_NAME = "s3"; - - private static final Pattern 
VPC_ENDPOINT_PATTERN = - Pattern.compile("^(?:.+\\.)?([a-z0-9-]+)\\.vpce\\.amazonaws\\.(?:com|com\\.cn)$"); - /** * Subclasses refer to this. */ @@ -104,28 +91,10 @@ public class DefaultS3ClientFactory extends Configured LoggerFactory.getLogger(DefaultS3ClientFactory.class); /** - * A one-off warning of default region chains in use. - */ - private static final LogExactlyOnce WARN_OF_DEFAULT_REGION_CHAIN = - new LogExactlyOnce(LOG); - - /** - * Warning message printed when the SDK Region chain is in use. + * Message printed when the SDK Region chain is in use. */ private static final String SDK_REGION_CHAIN_IN_USE = - "S3A filesystem client is using" - + " the SDK region resolution chain."; - - - /** Exactly once log to inform about ignoring the AWS-SDK Warnings for CSE. */ - private static final LogExactlyOnce IGNORE_CSE_WARN = new LogExactlyOnce(LOG); - - /** - * Error message when an endpoint is set with FIPS enabled: {@value}. - */ - @VisibleForTesting - public static final String ERROR_ENDPOINT_WITH_FIPS = - "Non central endpoint cannot be set when " + FIPS_ENDPOINT + " is true"; + "S3A filesystem client is using the SDK region resolution chain."; /** * A one-off log stating whether S3 Access Grants are enabled. @@ -307,174 +276,58 @@ protected ClientOverrideConfiguration.Builder createClientOverrideConfiguration( /** * This method configures the endpoint and region for a S3 client. - * The order of configuration is: - * - *
    - *
  1. If region is configured via fs.s3a.endpoint.region, use it.
  2. - *
  3. If endpoint is configured via via fs.s3a.endpoint, set it. - * If no region is configured, try to parse region from endpoint.
  4. - *
  5. If no region is configured, and it could not be parsed from the endpoint, - * set the default region as US_EAST_2
  6. - *
  7. If configured region is empty, fallback to SDK resolution chain.
  8. - *
  9. S3 cross region is enabled by default irrespective of region or endpoint - * is set or not.
  10. - *
- * + * See {@link RegionResolution} for the details. * @param builder S3 client builder. * @param parameters parameter object - * @param conf conf configuration object + * @param conf conf configuration object * @param S3 client builder type * @param S3 client type + * @return how the region was resolved. * @throws IllegalArgumentException if endpoint is set when FIPS is enabled. */ - private , ClientT> void configureEndpointAndRegion( - BuilderT builder, S3ClientCreationParameters parameters, Configuration conf) { - final String endpointStr = parameters.getEndpoint(); - final URI endpoint = getS3Endpoint(endpointStr, conf); - - final String configuredRegion = parameters.getRegion(); - Region region = null; - String origin = ""; - - // If the region was configured, set it. - if (configuredRegion != null && !configuredRegion.isEmpty()) { - origin = AWS_REGION; - region = Region.of(configuredRegion); - } + private , ClientT> RegionResolution.Resolution + configureEndpointAndRegion(BuilderT builder, + S3ClientCreationParameters parameters, + Configuration conf) throws IOException { - // FIPs? Log it, then reject any attempt to set an endpoint - final boolean fipsEnabled = parameters.isFipsEnabled(); - if (fipsEnabled) { - LOG.debug("Enabling FIPS mode"); - } - // always setting it guarantees the value is non-null, + final RegionResolution.Resolution resolution = + calculateRegion(parameters, conf); + LOG.debug("Region Resolution: {}", resolution); + + // always setting to true or false guarantees the value is non-null, // which tests expect. - builder.fipsEnabled(fipsEnabled); - - if (endpoint != null) { - boolean endpointEndsWithCentral = - endpointStr.endsWith(CENTRAL_ENDPOINT); - checkArgument(!fipsEnabled || endpointEndsWithCentral, "%s : %s", - ERROR_ENDPOINT_WITH_FIPS, - endpoint); - - // No region was configured, - // determine the region from the endpoint. 
- if (region == null) { - region = getS3RegionFromEndpoint(endpointStr, - endpointEndsWithCentral); - if (region != null) { - origin = "endpoint"; - } - } + builder.fipsEnabled(resolution.isUseFips()); - // No need to override endpoint with "s3.amazonaws.com". - // Let the client take care of endpoint resolution. Overriding - // the endpoint with "s3.amazonaws.com" causes 400 Bad Request - // errors for non-existent buckets and objects. - // ref: https://github.com/aws/aws-sdk-java-v2/issues/4846 - if (!endpointEndsWithCentral) { - builder.endpointOverride(endpoint); - LOG.debug("Setting endpoint to {}", endpoint); - } else { - origin = "central endpoint with cross region access"; - LOG.debug("Enabling cross region access for endpoint {}", - endpointStr); - } - } + if (Sdk != resolution.getMechanism()) { - if (region != null) { - builder.region(region); - } else if (configuredRegion == null) { - // no region is configured, and none could be determined from the endpoint. - // Use US_EAST_2 as default. - region = Region.of(AWS_S3_DEFAULT_REGION); - builder.region(region); - origin = "cross region access fallback"; - } else if (configuredRegion.isEmpty()) { - // region configuration was set to empty string. - // allow this if people really want it; it is OK to rely on this - // when deployed in EC2. - WARN_OF_DEFAULT_REGION_CHAIN.warn(SDK_REGION_CHAIN_IN_USE); - LOG.debug(SDK_REGION_CHAIN_IN_USE); - origin = "SDK region chain"; - } - boolean isCrossRegionAccessEnabled = conf.getBoolean(AWS_S3_CROSS_REGION_ACCESS_ENABLED, - AWS_S3_CROSS_REGION_ACCESS_ENABLED_DEFAULT); - // s3 cross region access - if (isCrossRegionAccessEnabled) { - builder.crossRegionAccessEnabled(true); + // a region has been determined from configuration, + // or it is falling back to central region. 
+ + final Region region = resolution.getRegion(); + builder.region(requireNonNull(region)); + // s3 cross region access + if (resolution.isCrossRegionAccessEnabled()) { + builder.crossRegionAccessEnabled(true); + } + final URI endpointUri = resolution.getEndpointUri(); + if (endpointUri != null && !resolution.isUseCentralEndpoint()) { + LOG.debug("Setting endpoint to {}", endpointUri); + builder.endpointOverride(endpointUri); + } } - LOG.debug("Setting region to {} from {} with cross region access {}", - region, origin, isCrossRegionAccessEnabled); + return resolution; } /** * Given a endpoint string, create the endpoint URI. - * + *

Kept in as subclasses use it. * @param endpoint possibly null endpoint. * @param conf config to build the URI from. * @return an endpoint uri */ protected static URI getS3Endpoint(String endpoint, final Configuration conf) { - boolean secureConnections = conf.getBoolean(SECURE_CONNECTIONS, DEFAULT_SECURE_CONNECTIONS); - - String protocol = secureConnections ? "https" : "http"; - - if (endpoint == null || endpoint.isEmpty()) { - // don't set an endpoint if none is configured, instead let the SDK figure it out. - return null; - } - - if (!endpoint.contains("://")) { - endpoint = String.format("%s://%s", protocol, endpoint); - } - - try { - return new URI(endpoint); - } catch (URISyntaxException e) { - throw new IllegalArgumentException(e); - } - } - - /** - * Parses the endpoint to get the region. - * If endpoint is the central one, use US_EAST_2. - * - * @param endpoint the configure endpoint. - * @param endpointEndsWithCentral true if the endpoint is configured as central. - * @return the S3 region, null if unable to resolve from endpoint. - */ - @VisibleForTesting - static Region getS3RegionFromEndpoint(final String endpoint, - final boolean endpointEndsWithCentral) { - - if (!endpointEndsWithCentral) { - // S3 VPC endpoint parsing - Matcher matcher = VPC_ENDPOINT_PATTERN.matcher(endpoint); - if (matcher.find()) { - LOG.debug("Mapping to VPCE"); - LOG.debug("Endpoint {} is vpc endpoint; parsing region as {}", endpoint, matcher.group(1)); - return Region.of(matcher.group(1)); - } - - LOG.debug("Endpoint {} is not the default; parsing", endpoint); - return AwsHostNameUtils.parseSigningRegion(endpoint, S3_SERVICE_NAME).orElse(null); - } - - // Select default region here to enable cross-region access. - // If both "fs.s3a.endpoint" and "fs.s3a.endpoint.region" are empty, - // Spark sets "fs.s3a.endpoint" to "s3.amazonaws.com". - // This applies to Spark versions with the changes of SPARK-35878. 
- // ref: - // https://github.com/apache/spark/blob/v3.5.0/core/ - // src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala#L528 - // If we do not allow cross region access, Spark would not be able to - // access any bucket that is not present in the given region. - // Hence, we should use default region us-east-2 to allow cross-region - // access. - return Region.of(AWS_S3_DEFAULT_REGION); + return RegionResolution.buildEndpointUri(endpoint, secureConnections); } private static , ClientT> void diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java index 910723f909f57..6f9c249b31a97 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/NetworkBinding.java @@ -22,6 +22,7 @@ import java.lang.reflect.InvocationTargetException; import java.net.URI; import java.net.URISyntaxException; +import java.util.Locale; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -99,13 +100,14 @@ public static void bindSSLChannelMode(Configuration conf, /** * Is this an AWS endpoint? looks at end of FQDN. - * @param endpoint endpoint - * @return true if the endpoint matches the requirements for an aws endpoint. + * @param endpoint endpoint. 
+ * @return true iff this is empty or ends with amazonaws.com or amazonaws.com.cn */ public static boolean isAwsEndpoint(final String endpoint) { + final String host = endpoint.toLowerCase(Locale.ROOT); return (endpoint.isEmpty() - || endpoint.endsWith(".amazonaws.com") - || endpoint.endsWith(".amazonaws.com.cn")); + || host.endsWith(".amazonaws.com") + || host.endsWith(".amazonaws.com.cn")); } /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RegionResolution.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RegionResolution.java new file mode 100644 index 0000000000000..f902e1f808b69 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RegionResolution.java @@ -0,0 +1,545 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.impl; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.awscore.util.AwsHostNameUtils; +import software.amazon.awssdk.regions.Region; + +import org.apache.hadoop.classification.VisibleForTesting; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3ClientFactory; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_CROSS_REGION_ACCESS_ENABLED; +import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_CROSS_REGION_ACCESS_ENABLED_DEFAULT; +import static org.apache.hadoop.fs.s3a.Constants.CENTRAL_ENDPOINT; +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_SECURE_CONNECTIONS; +import static org.apache.hadoop.fs.s3a.Constants.EMPTY_REGION; +import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; +import static org.apache.hadoop.fs.s3a.Constants.SDK_REGION; +import static org.apache.hadoop.fs.s3a.Constants.SECURE_CONNECTIONS; +import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.isAwsEndpoint; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.RegionResolutionMechanism.ExternalEndpoint; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.RegionResolutionMechanism.FallbackToCentral; +import static org.apache.hadoop.util.Preconditions.checkArgument; +import static software.amazon.awssdk.regions.Region.US_EAST_1; + +/** + * Region resolution. + *

This is complicated and can be a source of support escalations. + *

The V1 SDK was happy to take an endpoint and + * work details out from there, possibly probing us-east-1 and caching + * the result.

The V2 SDK likes the signing region and endpoint to be declared. + * The S3A connector has tried to mimic the V1 code, but lacks some features + * (use of environment variables, probing of EC2 IAM details) for which + * the SDK is better. + *

    + *
  1. If region is configured via fs.s3a.endpoint.region, use it.
  2. + *
  3. If endpoint is configured via fs.s3a.endpoint, set it. + * If no region is configured, try to parse region from endpoint.
  4. + *
  5. If no region is configured, and it could not be parsed from the endpoint, + * set the default region as US_EAST_1 for AWS or undeclared endpoints, and as "external" for any other endpoint
  6. + *
  7. If configured region is empty, fallback to SDK resolution chain.
  8. + *
  9. S3 cross region is enabled by default irrespective of region or endpoint + * is set or not.
  10. + *
+ */ +public class RegionResolution { + + protected static final Logger LOG = + LoggerFactory.getLogger(RegionResolution.class); + + /** + * Service to ask SDK to parse. + */ + private static final String S3_SERVICE_NAME = "s3"; + + /** + * Pattern to match vpce endpoints on. + */ + private static final Pattern VPC_ENDPOINT_PATTERN = + Pattern.compile("^(?:.+\\.)?([a-z0-9-]+)\\.vpce\\.amazonaws\\.(?:com|com\\.cn)$"); + + /** + * Error message when an endpoint is set with FIPS enabled: {@value}. + */ + @VisibleForTesting + public static final String ERROR_ENDPOINT_WITH_FIPS = + "Only S3 central endpoint cannot be set when " + FIPS_ENDPOINT + " is true"; + + /** + * Virtual hostnames MUST be used when using the FIPS endpoint. + */ + public static final String FIPS_PATH_ACCESS_INCOMPATIBLE = + "Path style access must be disabled when " + FIPS_ENDPOINT + " is true"; + + /** + * String value for external region: {@value}. + */ + public static final String EXTERNAL = "external"; + + /** + * External region, used for third party endpoints. + */ + public static final Region EXTERNAL_REGION = Region.of(EXTERNAL); + + private RegionResolution() { + } + + /** + * How was the region resolved? + */ + public enum RegionResolutionMechanism { + + /** Endpoint inference. */ + CalculatedFromEndpoint("Calculated from endpoint."), + + /** It's an external endpoint */ + ExternalEndpoint("External endpoint"), + + /** No resolution: falling back to central endpoint. */ + FallbackToCentral("Fallback to central endpoint"), + + /** Connection is a VPCE endpoint which was parsed for the region. */ + ParseVpceEndpoint("Parse VPCE Endpoint"), + + /** SDK requested. */ + Sdk("SDK resolution chain"), + + /** Set in configuration. */ + Specified("Region specified"); + + /** + * Text of the mechanism. + */ + private final String mechanism; + + RegionResolutionMechanism(String mechanism) { + this.mechanism = mechanism; + } + + /** + * String value of the resolution mechanism. 
+ * @return the resolution mechanism. + */ + public String getMechanism() { + return mechanism; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("RegionResolutionMechanism{"); + sb.append("mechanism='").append(mechanism).append('\''); + sb.append('}'); + return sb.toString(); + } + } + + /** + * The resolution of a region and endpoint.. + */ + public static final class Resolution { + + /** + * Region: if null hand down to the SDK. + */ + private Region region; + + /** + * How was the region resolved? + * Null means unresolved. + */ + private RegionResolutionMechanism mechanism; + + /** + * Should FIPS be enabled? + */ + private boolean useFips; + + /** + * Should cross-region access be enabled? + */ + private boolean crossRegionAccessEnabled; + + /** + * Endpoint as string. + */ + private String endpointStr; + + /** + * Endpoint URI. + */ + private URI endpointUri; + + /** + * Use the central endpoint? + */ + private boolean useCentralEndpoint; + + /** Empty constructor. */ + public Resolution() { + } + + /** + * Instantiate with a region and resolution mechanism. + * @param region region + * @param mechanism resolution mechanism. + */ + public Resolution(final Region region, final RegionResolutionMechanism mechanism) { + this.region = region; + this.mechanism = mechanism; + } + + /** + * Set the region. + * Declares the region as resolved even when the value is null (i.e. resolve to SDK). + * @param resolvedRegion region + * @param resolutionMechanism resolution mechanism. + * @return the builder + */ + public Resolution withRegion( + @Nullable final Region resolvedRegion, + final RegionResolutionMechanism resolutionMechanism) { + this.region = resolvedRegion; + this.mechanism = requireNonNull(resolutionMechanism); + return this; + } + + /** + * Set builder value. 
+ * @param value new value + * @return the builder + */ + public Resolution withUseFips(final boolean value) { + useFips = value; + return this; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public Resolution withCrossRegionAccessEnabled(final boolean value) { + crossRegionAccessEnabled = value; + return this; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public Resolution withEndpointStr(final String value) { + endpointStr = value; + return this; + } + + /** + * Endpoint URI. + * @return value if set. + */ + public URI getEndpointUri() { + return endpointUri; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public Resolution withEndpointUri(final URI value) { + endpointUri = value; + return this; + } + + /** + * Endpoint as string. + * @return value if set. + */ + public String getEndpointStr() { + return endpointStr; + } + + /** + * Region: if null hand down to the SDK. + * @return value if set. + */ + public Region getRegion() { + return region; + } + + /** + * Should FIPS be enabled? + * @return flag state. + */ + public boolean isUseFips() { + return useFips; + } + + /** + * Should cross-region access be enabled? + * @return flag state. + */ + public boolean isCrossRegionAccessEnabled() { + return crossRegionAccessEnabled; + } + + /** + * How was the region resolved? + * Null means unresolved. + * @return value if set. + */ + public RegionResolutionMechanism getMechanism() { + return mechanism; + } + + /** + * Is the region resolved. + * @return true if there's been a resolution. + */ + public boolean isRegionResolved() { + return mechanism != null; + } + + /** + * Use the central endpoint? + * @return flag state. + */ + public boolean isUseCentralEndpoint() { + return useCentralEndpoint; + } + + /** + * Set builder value. 
+ * @param value new value + * @return the builder + */ + public Resolution withUseCentralEndpoint(final boolean value) { + useCentralEndpoint = value; + return this; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("Resolution{"); + sb.append("region=").append(region); + sb.append(", resolution=").append(mechanism); + sb.append(", useFips=").append(useFips); + sb.append(", crossRegionAccessEnabled=").append(crossRegionAccessEnabled); + sb.append(", endpointUri=").append(endpointUri); + sb.append(", useCentralEndpoint=").append(useCentralEndpoint); + sb.append('}'); + return sb.toString(); + } + } + + /** + * Given a endpoint string, create the endpoint URI. + * @param endpoint possibly null endpoint. + * @param secureConnections use secure HTTPS connection? + * @return an endpoint uri or null if the endpoint was passed in was null/empty + * @throws IllegalArgumentException failure to parse the endpoint. + */ + public static URI buildEndpointUri(String endpoint, final boolean secureConnections) { + + String protocol = secureConnections ? "https" : "http"; + + if (endpoint == null || endpoint.isEmpty()) { + // don't set an endpoint if none is configured, instead let the SDK figure it out. + return null; + } + + if (!endpoint.contains("://")) { + endpoint = String.format("%s://%s", protocol, endpoint); + } + + try { + return new URI(endpoint); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e); + } + } + + /** + * Parses the endpoint to get the region. + * If endpoint is the central one, use US_EAST_1. + * @param endpoint the configure endpoint. + * @param endpointEndsWithCentral true if the endpoint is configured as central. 
+ * @return the S3 region resolution if possible from parsing the endpoint + */ + @VisibleForTesting + public static Optional determineS3RegionFromEndpoint( + final String endpoint, + final boolean endpointEndsWithCentral) { + + if (!endpointEndsWithCentral) { + // S3 VPC endpoint parsing + Matcher matcher = VPC_ENDPOINT_PATTERN.matcher(endpoint); + if (matcher.find()) { + LOG.debug("Mapping to VPCE"); + LOG.debug("Endpoint {} is VPC endpoint; parsing region as {}", + endpoint, matcher.group(1)); + return Optional.of(new Resolution( + Region.of(matcher.group(1)), + RegionResolutionMechanism.ParseVpceEndpoint)); + } + + LOG.debug("Endpoint {} is not the default; parsing signing region from name.", endpoint); + return AwsHostNameUtils.parseSigningRegion(endpoint, S3_SERVICE_NAME) + .map(r -> + new Resolution(r, RegionResolutionMechanism.CalculatedFromEndpoint)); + } + + // No resolution. + return Optional.empty(); + } + + + /** + * Does the region name refer to an SDK region? + * @param configuredRegion region in the configuration + * @return true if this is considered to refer to an SDK region. + */ + public static boolean isSdkRegion(String configuredRegion) { + return SDK_REGION.equalsIgnoreCase(configuredRegion) + || EMPTY_REGION.equalsIgnoreCase(configuredRegion); + } + + /** + * Calculate the region and the final endpoint. + * @param parameters creation parameters + * @param conf configuration with other options. + * @return the resolved region and endpoint. + * @throws IOException if the client failed to communicate with the IAM service. + * @throws IllegalArgumentException failure to parse endpoint, or FIPS settings. 
+ */ + @Retries.OnceTranslated + public static Resolution calculateRegion( + final S3ClientFactory.S3ClientCreationParameters parameters, + final Configuration conf) throws IOException { + + Resolution resolution = new Resolution(); + + // endpoint; may be null + final String endpointStr = parameters.getEndpoint(); + boolean endpointDeclared = endpointStr != null && !endpointStr.isEmpty(); + final URI endpoint; + if (endpointDeclared) { + endpoint = buildEndpointUri(endpointStr, + conf.getBoolean(SECURE_CONNECTIONS, DEFAULT_SECURE_CONNECTIONS)); + } else { + // set to null if endpointStr is null/empty + endpoint = null; + } + + final String configuredRegion = parameters.getRegion(); + + // If the region was configured, set it. + // this includes special handling of the sdk, ec2 and "" regions. + if (configuredRegion != null) { + checkArgument(!"null".equals(configuredRegion), + "null is region name"); + if (isSdkRegion(configuredRegion)) { + resolution.withRegion(null, RegionResolutionMechanism.Sdk); + } else { + resolution.withRegion(Region.of(configuredRegion), + RegionResolutionMechanism.Specified); + } + } + + // central endpoint if no endpoint has been set, or it is explicitly + // requested + boolean endpointEndsWithCentral = !endpointDeclared + || endpointStr.endsWith(CENTRAL_ENDPOINT); + + if (!resolution.isRegionResolved()) { + // parse from the endpoint and set if calculated + LOG.debug("Attempting to determine region from endpoint {}; endpointEndsWithCentral={}", + endpointStr, endpointEndsWithCentral); + determineS3RegionFromEndpoint(endpointStr, endpointEndsWithCentral).ifPresent(r -> + resolution.withRegion(r.getRegion(), r.getMechanism())); + } + + // cross region setting. + resolution.withCrossRegionAccessEnabled( + conf.getBoolean(AWS_S3_CROSS_REGION_ACCESS_ENABLED, + AWS_S3_CROSS_REGION_ACCESS_ENABLED_DEFAULT)); + + // fips settings. 
+ final boolean fipsEnabled = parameters.isFipsEnabled(); + resolution.withUseFips(fipsEnabled); + if (fipsEnabled) { + // validate the FIPS settings + checkArgument(endpoint == null || endpointEndsWithCentral, + "%s : %s", ERROR_ENDPOINT_WITH_FIPS, endpoint); + checkArgument(!parameters.isPathStyleAccess(), + FIPS_PATH_ACCESS_INCOMPATIBLE); + } + + if (!resolution.isRegionResolved()) { + // still not resolved. + if (!endpointDeclared || isAwsEndpoint(endpointStr)) { + // still failing to resolve the region + // fall back to central + resolution.withRegion(US_EAST_1, FallbackToCentral); + } else { + // we are not resolved and not an aws region. + // set the region to being "external" + resolution.withRegion(EXTERNAL_REGION, ExternalEndpoint); + } + } + + // No need to override endpoint with "s3.amazonaws.com". + // Let the client take care of endpoint resolution. Overriding + // the endpoint with "s3.amazonaws.com" causes 400 Bad Request + // errors for non-existent buckets and objects. + // ref: https://github.com/aws/aws-sdk-java-v2/issues/4846 + if (endpointEndsWithCentral) { + resolution.withUseCentralEndpoint(true); + } else { + LOG.debug("Setting endpoint to {}", endpoint); + resolution.withEndpointStr(endpointStr) + .withEndpointUri(endpoint) + .withUseCentralEndpoint(false); + } + + final Region r = resolution.getRegion(); + if (r != null && !Region.regions().contains(r)) { + // note that the region isn't known. + // not an issue for third party stores, otherwise it may be a region newer than + // that expected by the SDK. Hence: only log at debug. 
+ LOG.debug("Region {} is not recognized by this SDK", r); + } + return resolution; + } + +} diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md index 3645a652b64cb..c1794d00841c3 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md @@ -61,7 +61,7 @@ The S3A connector supports S3 cross region access via AWS SDK which is enabled b Not supported: * AWS [Snowball](https://aws.amazon.com/snowball/). -As of December 2023, AWS S3 uses Transport Layer Security (TLS) [version 1.2](https://aws.amazon.com/blogs/security/tls-1-2-required-for-aws-endpoints/) to secure the communications channel; the S3A client is does this through +As of December 2023, AWS S3 uses Transport Layer Security (TLS) [version 1.2](https://aws.amazon.com/blogs/security/tls-1-2-required-for-aws-endpoints/) to secure the communications channel; the S3A client does this through the Apache [HttpClient library](https://hc.apache.org/index.html). ### Third party stores @@ -74,80 +74,191 @@ _MUST_ be installed on the JVMs on hosts within the Hadoop cluster. See [Working with Third-party S3 Stores](third_party_stores.html) *after* reading this document. -## Connection Settings +## Endpoint and Region Settings -There are three core settings to connect to an S3 store, endpoint, region and whether or not to use path style access. +There are three core settings to connect to an S3 store: endpoint, region and whether to use path style access. + +The term "endpoint" means the URL or hostname of the remote S3 store. +The default S3 endpoint is `s3.amazonaws.com`. +When a request is made to a bucket and path style access is false, the bucket name is +prefixed to the endpoint to form the hostname that HTTP requests are sent to. A bucket `example` would +end up with the hostname `example.s3.amazonaws.com`. 
+ +S3 Buckets are hosted in different AWS regions. + +Each region has its own S3 endpoint, documented [by Amazon](http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region). + +1. Applications running in EC2 infrastructure do not pay for IO to/from + *local S3 buckets*. They will be billed for access to remote buckets. Always + use local buckets and local copies of data, wherever possible. +2. With the V4 signing protocol, AWS requires the explicit region endpoint + to be used —hence S3A must be configured to use the specific endpoint. This + is done by setting the region in the configuration option `fs.s3a.endpoint.region`, + or by explicitly setting `fs.s3a.endpoint` and `fs.s3a.endpoint.region`. +3. All endpoints other than the default region only support interaction + with buckets local to that S3 instance. +4. Standard S3 buckets support "cross-region" access where use of the original `us-east-1` + endpoint allows access to the data, but newer storage types, particularly S3 Express, are + not supported. + +If the wrong endpoint is used, the request will fail. This may be reported as a 301/redirect error, +or as a "400 Bad Request": take these as cues to check the endpoint setting of +a bucket. + +The up-to-date list of regions is [available online](https://docs.aws.amazon.com/general/latest/gr/s3.html). + +Knowing the region of a bucket is key to being able to communicate and authenticate with +an S3 bucket. ```xml - - fs.s3a.endpoint - AWS S3 endpoint to connect to. An up-to-date list is - provided in the AWS Documentation: regions and endpoints. Without this - property, the endpoint/hostname of the S3 Store is inferred from - the value of fs.s3a.endpoint.region, fs.s3a.endpoint.fips and more. - - fs.s3a.endpoint.region REGION - AWS Region of the data + AWS Region of the bucket + + + + fs.s3a.endpoint + AWS S3 endpoint to connect to. + Leave blank for the SDK to determine it from the region and/or other settings.
+ fs.s3a.path.style.access false Enable S3 path style access by disabling the default virtual hosting behaviour. - Needed for AWS PrivateLink, S3 AccessPoints, and, generally, third party stores. + Needed for AWS PrivateLink, S3 AccessPoints, and third party stores. Default: false. ``` -Historically the S3A connector has preferred the endpoint as defined by the option `fs.s3a.endpoint`. +There are also some secondary options. The `fs.s3a.endpoint.fips` is covered in its own section; +the option `fs.s3a.cross.region.access.enabled` is generally left alone -this SDK feature is +often critical when configuring a cluster to work with data round the world. + +```xml + + fs.s3a.cross.region.access.enabled + true + SDK to fall back to cross-region bucket access + + + + fs.s3a.endpoint.fips + false + Use the FIPS endpoint + +``` + +Historically the S3A connector preferred the endpoint as defined by the option `fs.s3a.endpoint`. With the move to the AWS V2 SDK, there is more emphasis on the region, set by the `fs.s3a.endpoint.region` option. -Normally, declaring the region in `fs.s3a.endpoint.region` should be sufficient to set up the network connection to correctly connect to an AWS-hosted S3 store. +Normally, declaring the region in `fs.s3a.endpoint.region` should be sufficient to set up the network +connection to correctly connect to an _AWS-hosted S3 store_. + +When connecting to third-party stores, the `fs.s3a.endpoint` option becomes critical; +the value of `fs.s3a.endpoint.region` can still tune s3a client behavior. ### S3 endpoint and region settings in detail -* Configs `fs.s3a.endpoint` and `fs.s3a.endpoint.region` are used to set values - for S3 endpoint and region respectively. -* If `fs.s3a.endpoint.region` is configured with valid AWS region value, S3A will - configure the S3 client to use this value. If this is set to a region that does - not match your bucket, you will receive a 301 redirect response. 
-* If `fs.s3a.endpoint.region` is not set and `fs.s3a.endpoint` is set with valid - endpoint value, S3A will attempt to parse the region from the endpoint and - configure S3 client to use the region value. -* If both `fs.s3a.endpoint` and `fs.s3a.endpoint.region` are not set, S3A will - use `us-east-2` as default region and enable cross region access. In this case, - S3A does not attempt to override the endpoint while configuring the S3 client. -* If `fs.s3a.endpoint` is not set and `fs.s3a.endpoint.region` is set to an empty - string, S3A will configure S3 client without any region or endpoint override. - This will allow fallback to S3 SDK region resolution chain. More details - [here](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). -* If `fs.s3a.endpoint` is set to central endpoint `s3.amazonaws.com` and - `fs.s3a.endpoint.region` is not set, S3A will use `us-east-2` as default region - and enable cross region access. In this case, S3A does not attempt to override - the endpoint while configuring the S3 client. -* If `fs.s3a.endpoint` is set to central endpoint `s3.amazonaws.com` and - `fs.s3a.endpoint.region` is also set to some region, S3A will use that region - value and enable cross region access. In this case, S3A does not attempt to - override the endpoint while configuring the S3 client. - -When the cross region access is enabled while configuring the S3 client, even if the -region set is incorrect, S3 SDK determines the region. This is done by making the -request, and if the SDK receives 301 redirect response, it determines the region at -the cost of a HEAD request, and caches it. - -Please note that some endpoint and region settings that require cross region access +1. Configuration options `fs.s3a.endpoint.region` and `fs.s3a.endpoint` are used to set values + for the S3 region and endpoint respectively. +2. 
If `fs.s3a.endpoint.region` is configured with a valid AWS region value, S3A will + configure the S3 client to use this value. If this is set to a region that does + not match your bucket, you will receive a 301 redirect response. +3. If `fs.s3a.endpoint.region` is not set and `fs.s3a.endpoint` is set to an AWS regional endpoint, + S3A will determine the region by parsing the endpoint string. + This works for VPCE, `amazonaws.com` and `amazonaws.com.cn` endpoints. +4. If `fs.s3a.endpoint.region` is set to `sdk` then region resolution is handled + by the SDK. Its process is documented + [here](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +5. If both `fs.s3a.endpoint` and `fs.s3a.endpoint.region` are unset, S3A will + use `us-east-1` as the default region and expect cross-region access. +6. If `fs.s3a.endpoint` is not set and `fs.s3a.endpoint.region` is set to the empty + string `""`, S3A will use the SDK resolution, as when the region is set to `sdk`. + (This differs from point 5, where the option is unset rather than set to an empty string.) +7. If `fs.s3a.endpoint` is set to the central endpoint `s3.amazonaws.com` and + `fs.s3a.endpoint.region` is not set, S3A will use `us-east-1` as the default region + and expect cross-region access. + +When cross-region access is enabled, the AWS SDK determines the region when it is unknown. +This is done by making the request, and if the SDK receives a 301 redirect response, it issues +a HEAD request to the bucket to determine its location. +The location is cached for the lifetime of the JVM. + +This cross-region resolution requires that the host performing the lookup has network access +to the central region. If the host is in an AWS VPC which lacks such network +access, cross-region lookup will fail. + +Please note that some endpoint and region settings that require cross-region access are complex and improving over time. Hence, they may be considered unstable.
-*Important:* do not use `auto`, `ec2`, or `sdk` as these may be used -in the future for specific region-binding algorithms. +If you are working with third party stores, please check [third party stores in detail](./third_party_stores.html). + +If this seems confusing: you are correct! + +Here is what to do: + +#### Deploying on EC2 and working with AWS S3 buckets mostly in the local region + +1. Leave `fs.s3a.endpoint` unset. +2. Set `fs.s3a.endpoint.region` to `sdk`. +3. Leave `fs.s3a.cross.region.access.enabled` as `true`. + +This hands off resolution to the SDK, which will use the EC2 instance metadata service to determine the local region. +The SDK will use this to build the endpoint URL and sign all requests. + +Remote buckets will be accessed via probes to `s3.amazonaws.com`, relying on +cross-region access to resolve their location. + + +#### On-prem access to AWS S3 where the bucket region is known + +1. Leave `fs.s3a.endpoint` unset. +2. Set `fs.s3a.endpoint.region` to the region of the bucket. + +The AWS SDK will choose the correct endpoint for the bucket region and sign requests +appropriately. + +#### On-prem access to AWS S3 where the bucket region is **not** known + +1. Leave `fs.s3a.endpoint` unset. +2. Set `fs.s3a.endpoint.region` to `sdk`. +3. Leave `fs.s3a.cross.region.access.enabled` as `true`. + +The AWS SDK will attempt to connect to the bucket via the central `s3.amazonaws.com` endpoint; +if the bucket is elsewhere, the SDK will determine the correct location. + +#### On-prem access to AWS S3 through VPCE + +1. Set `fs.s3a.endpoint` to the VPCE endpoint. +2. Set `fs.s3a.endpoint.region` to the region of the bucket, *or leave unset*. +3. Set `fs.s3a.path.style.access` to `true`.
+ +```xml + + fs.s3a.bucket.example.endpoint + https://bucket.vpce-05ba4f2400000-x92g7xzc.s3.us-west-2.vpce.amazonaws.com/ + + + + fs.s3a.bucket.example.path.style.access + true + +``` + +#### Third party stores + +See [Third Party Stores](./third_party_stores.html) for the full details and example settings. +* Set `fs.s3a.endpoint` to the full URL of the service, or, if it supports virtual hostnames, +to the domain name to which virtual hosts are prefixed. +* Set `fs.s3a.endpoint.region` to `external`. +* If working with an HTTP endpoint, set `fs.s3a.connection.ssl.enabled` to `false`. -If you are working with third party stores, please check [third party stores in detail](third_party_stores.html). ### Network timeouts @@ -285,6 +396,12 @@ Core aspects of pool settings are: ``` +Using OpenSSL is 5-10% faster than using the Java 8 TLS implementation, that is: *SQL queries complete faster*. + +It is hard to set up and a bit brittle, but if possible, use it! + + + ### Proxy Settings Connections to S3A stores can be made through an HTTP or HTTPS proxy. @@ -343,38 +460,6 @@ if long-lived connections have problems. ## Using Per-Bucket Configuration to access data round the world -S3 Buckets are hosted in different "regions", the default being "US-East-1". -The S3A client talks to this region by default, issuing HTTP requests -to the server `s3.amazonaws.com`. - -S3A can work with buckets from any region. Each region has its own -S3 endpoint, documented [by Amazon](http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region). - -1. Applications running in EC2 infrastructure do not pay for IO to/from -*local S3 buckets*. They will be billed for access to remote buckets. Always -use local buckets and local copies of data, wherever possible. -2. With the V4 signing protocol, AWS requires the explicit region endpoint -to be used —hence S3A must be configured to use the specific endpoint.
This -is done by setting the regon in the configuration option `fs.s3a.endpoint.region`, -or by explicitly setting `fs.s3a.endpoint` and `fs.s3a.endpoint.region`. -3. All endpoints other than the default region only support interaction -with buckets local to that S3 instance. -4. Standard S3 buckets support "cross-region" access where use of the original `us-east-1` - endpoint allows access to the data, but newer storage types, particularly S3 Express are - not supported. - - - -If the wrong endpoint is used, the request will fail. This may be reported as a 301/redirect error, -or as a 400 Bad Request: take these as cues to check the endpoint setting of -a bucket. - -The up to date list of regions is [Available online](https://docs.aws.amazon.com/general/latest/gr/s3.html). - -This list can be used to specify the endpoint of individual buckets, for example -for buckets in the us-west-2 and EU/Ireland endpoints. - - ```xml fs.s3a.bucket.us-west-2-dataset.endpoint.region diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index 0336efa677c0b..f1b4f3ed3fb8c 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -102,7 +102,7 @@ AWS SDK requires the name of a region is supplied for signing, and that region m Third-party stores don't normally care about the name of a region, *only that a region is supplied*. You should set `fs.s3a.endpoint.region` to anything except the following reserved names: `sdk`, `ec2` and `auto`. -We have plans for those. +Recommended: `external` ## Other issues @@ -445,7 +445,6 @@ The S3A client's creation of an endpoint URL generates an unknown host. 
``` - ``` ls: software.amazon.awssdk.core.exception.SdkClientException: Received an UnknownHostException when attempting to interact with a service. @@ -491,7 +490,7 @@ at [ECS Test Drive](https://portal.ecstestdrive.com/) were ```xml fs.s3a.endpoint.region - dell + external arbitrary name other than sdk, ec2, auto or null @@ -564,7 +563,7 @@ this makes renaming and deleting significantly slower. fs.s3a.endpoint.region - gcs + external @@ -640,3 +639,67 @@ It is also a way to regression test foundational S3A third-party store compatibi _Note_ If anyone is set up to test this regularly, please let the hadoop developer team know if regressions do surface, as it is not a common test configuration. We do use it to help test compatibility during SDK updates. + +## RustFS localhost with no https + +RustFS is an easy to deploy S3 store. + +In tests of the S3A connector in December 2025 we observed: +1. Eventual consistency in path deletion (LIST responses included recently deleted objects; HEAD correctly returned 404) +2. Eventual consistency in lists of multipart object uploads (`s3guard uploads` command, *and* s3a committer cleanup) +3. Case inconsistency when running on a MacOS system; not tested elsewhere. +4. Other minor issues in niche API calls (`getBucketMetadata()`) which don't affect normal use. + +Listing inconsistency after directory deletion is the key issue which may break applications as it means that +* Newly deleted directories may still return objects. +* Newly renamed objects may still be listable at the source paths. +* The logic which determines whether an empty directory marker should be reinserted after a child path deletion may not behave correctly. + +It may be safe for use with tables which are designed to work on inconsistent object stores (Apache Iceberg and rivals), but +it does not, as of December 2025 appear safe for use with classic Hive directory structured tables, through Hive, Spark or other applications. 
+Nor are the S3A committers guaranteed to work safely. + +Use at your own risk. Running the `hadoop-aws` test suite against your store would be the ideal way to see if later +versions have changed their behavior. + +Example settings for a local rust bucket. Note that `fs.s3a.bucket.rustybucket.connection.ssl.enabled` has been set to false +as the SDK doesn't look at the http/https prefix of the endpoint to determine which protocol to use. + +```xml + + fs.s3a.bucket.rustybucket.access.key + rustfsadmin + + + + fs.s3a.bucket.rustybucket.secret.key + rustfsadmin + + + + fs.s3a.bucket.rustybucket.endpoint + http://localhost:9000 + + + + + fs.s3a.bucket.rustybucket.connection.ssl.enabled + false + + + + fs.s3a.bucket.rustybucket.endpoint.region + external + + + + fs.s3a.bucket.rustybucket.path.style.access + true + + + + fs.s3a.bucket.rustybucket.create.conditional.enabled + false + +``` + diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java index 81a295345a8fc..31ab2668b558a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java @@ -27,6 +27,7 @@ import java.net.URI; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.test.AbstractHadoopTestBase; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -35,7 +36,7 @@ /** * Abstract base class for S3A unit tests using a mock S3 client. 
*/ -public abstract class AbstractS3AMockTest { +public abstract class AbstractS3AMockTest extends AbstractHadoopTestBase { protected static final String BUCKET = "mock-bucket"; protected static final AwsServiceException NOT_FOUND = diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java index 25efe7a06e5ae..7259526b2f88a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java @@ -29,8 +29,10 @@ import org.assertj.core.api.Assertions; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.opentest4j.AssertionFailedError; import software.amazon.awssdk.awscore.AwsExecutionAttribute; import software.amazon.awssdk.awscore.exception.AwsServiceException; +import software.amazon.awssdk.core.exception.SdkClientException; import software.amazon.awssdk.core.interceptor.Context; import software.amazon.awssdk.core.interceptor.ExecutionAttributes; import software.amazon.awssdk.core.interceptor.ExecutionInterceptor; @@ -54,11 +56,12 @@ import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.PATH_STYLE_ACCESS; import static org.apache.hadoop.fs.s3a.Constants.S3_ENCRYPTION_ALGORITHM; -import static org.apache.hadoop.fs.s3a.DefaultS3ClientFactory.ERROR_ENDPOINT_WITH_FIPS; +import static org.apache.hadoop.fs.s3a.Constants.SDK_REGION; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeStoreAwsHosted; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.ERROR_ENDPOINT_WITH_FIPS; 
import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.DEFAULT_REQUESTER_PAYS_BUCKET_NAME; import static org.apache.hadoop.io.IOUtils.closeStream; import static org.apache.hadoop.test.LambdaTestUtils.intercept; @@ -113,6 +116,9 @@ public class ITestS3AEndpointRegion extends AbstractS3ATestBase { * Text to include in assertions. */ private static final AtomicReference EXPECTED_MESSAGE = new AtomicReference<>(); + + public static final String INCORRECT_REGION_SET = "Incorrect region set"; + /** * New FS instance which will be closed in teardown. */ @@ -223,6 +229,32 @@ public void testWithRegionConfig() throws Throwable { expectInterceptorException(client); } + /** + * This hands off resolution to the SDK which may fail if nothing can be found + * (non-EC2; no AWS_REGION env var or through {@code ~/.aws/config}. + * There's separate handling for the different failure modes so this + * test will work in all deployments. + */ + @Test + public void testWithSDKRegionConfig() throws Throwable { + describe("Create a client with an SDK region"); + Configuration conf = getConfiguration(); + + try { + S3Client client = createS3Client(conf, CENTRAL_ENDPOINT, SDK_REGION, null, false); + + expectInterceptorException(client); + } catch (SdkClientException e) { + Assertions.assertThat(e) + .describedAs("Exception raised due to unable to resolve region") + .hasMessageContaining("region"); + } catch (AssertionFailedError e) { + Assertions.assertThat(e) + .describedAs("Exception raised region resolution working on local system") + .hasMessageContaining(INCORRECT_REGION_SET); + } + } + @Test public void testWithFips() throws Throwable { describe("Create a client with fips enabled"); @@ -646,7 +678,7 @@ public void beforeExecution(Context.BeforeExecution context, } Assertions.assertThat(reg) - .describedAs("Incorrect region set in %s. Client Config=%s", + .describedAs(INCORRECT_REGION_SET + " in %s. 
Client Config=%s", state, EXPECTED_MESSAGE.get()) .isEqualTo(region); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AEndpointParsing.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AEndpointParsing.java index 8be0708cad542..3c234381d1c55 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AEndpointParsing.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AEndpointParsing.java @@ -18,10 +18,14 @@ package org.apache.hadoop.fs.s3a; +import java.util.Optional; + import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; import software.amazon.awssdk.regions.Region; +import org.apache.hadoop.fs.s3a.impl.RegionResolution; + public class TestS3AEndpointParsing extends AbstractS3AMockTest { private static final String VPC_ENDPOINT = "vpce-1a2b3c4d-5e6f.s3.us-west-2.vpce.amazonaws.com"; @@ -29,15 +33,21 @@ public class TestS3AEndpointParsing extends AbstractS3AMockTest { private static final String US_WEST_2 = "us-west-2"; private static final String EU_WEST_1 = "eu-west-1"; - @Test - public void testVPCEndpoint() { - Region region = DefaultS3ClientFactory.getS3RegionFromEndpoint(VPC_ENDPOINT, false); - Assertions.assertThat(region).isEqualTo(Region.of(US_WEST_2)); - } - - @Test - public void testNonVPCEndpoint() { - Region region = DefaultS3ClientFactory.getS3RegionFromEndpoint(NON_VPC_ENDPOINT, false); - Assertions.assertThat(region).isEqualTo(Region.of(EU_WEST_1)); - } + @Test + public void testVPCEndpoint() { + Optional + region = RegionResolution.determineS3RegionFromEndpoint(VPC_ENDPOINT, false); + Assertions.assertThat(region).get() + .extracting(RegionResolution.Resolution::getRegion) + .isEqualTo(Region.of(US_WEST_2)); + } + + @Test + public void testNonVPCEndpoint() { + Optional + region = RegionResolution.determineS3RegionFromEndpoint(NON_VPC_ENDPOINT, false); + Assertions.assertThat(region).get() + 
.extracting(RegionResolution.Resolution::getRegion) + .isEqualTo(Region.of(EU_WEST_1)); + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestRegionResolution.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestRegionResolution.java new file mode 100644 index 0000000000000..f26ddd3403def --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestRegionResolution.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.impl; + +import java.io.IOException; + +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.regions.Region; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3ClientFactory; +import org.apache.hadoop.test.AbstractHadoopTestBase; + +import static org.apache.hadoop.fs.s3a.Constants.CENTRAL_ENDPOINT; +import static org.apache.hadoop.fs.s3a.Constants.SDK_REGION; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.ERROR_ENDPOINT_WITH_FIPS; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.calculateRegion; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Test region resolution logic in {@link RegionResolution}. + * These are based on {@code ITestS3AEndpointRegion}. + */ +public class TestRegionResolution extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestRegionResolution.class); + + private static final String US_EAST_1 = "us-east-1"; + + private static final String US_EAST_2 = "us-east-2"; + + private static final String US_WEST_2 = "us-west-2"; + + private static final String EU_WEST_2 = "eu-west-2"; + + private static final String CN_NORTHWEST_1 = "cn-northwest-1"; + + private static final String US_GOV_EAST_1 = "us-gov-east-1"; + + private static final String EU_WEST_2_ENDPOINT = "s3.eu-west-2.amazonaws.com"; + + private static final String CN_ENDPOINT = "s3.cn-northwest-1.amazonaws.com.cn"; + + private static final String GOV_ENDPOINT = "s3-fips.us-gov-east-1.amazonaws.com"; + + private static final String VPC_ENDPOINT = "vpce-1a2b3c4d-5e6f.s3.us-west-2.vpce.amazonaws.com"; + + private static final String CN_VPC_ENDPOINT = + "vpce-1a2b3c4d-5e6f.s3.cn-northwest-1.vpce.amazonaws.com.cn"; + + + private Configuration getConfiguration() { + return new Configuration(false); + } + + /** + * 
Describe a test. This is a replacement for javadocs + * where the tests role is printed in the log output + * @param text description + */ + protected void describe(String text) { + LOG.info(text); + } + + private RegionResolution.Resolution resolve(Configuration conf, + String endpoint, + String configuredRegion, + boolean isFips, + String expectedRegion, + final RegionResolution.RegionResolutionMechanism expectedMechanism) throws IOException { + S3ClientFactory.S3ClientCreationParameters parameters = + new S3ClientFactory.S3ClientCreationParameters() + .withEndpoint(endpoint) + .withRegion(configuredRegion) + .withFipsEnabled(isFips); + final RegionResolution.Resolution resolved = calculateRegion(parameters, conf); + + // check the region + if (expectedRegion != null) { + Assertions.assertThat(resolved.getRegion()) + .describedAs("Resolved region %s", resolved) + .isNotNull() + .isEqualTo(Region.of(expectedRegion)); + } else { + Assertions.assertThat(resolved.getRegion()) + .describedAs("Resolved region %s", resolved) + .isNull(); + } + + // supplied resolution + if (expectedMechanism != null) { + assertMechanism(expectedMechanism, resolved); + } + return resolved; + } + + /** + * Assert that a resolution used a specific mechanism. + * @param expectedMechanism expected mechanism. 
+ * @param resolved resolved region + */ + private static void assertMechanism( + final RegionResolution.RegionResolutionMechanism expectedMechanism, + final RegionResolution.Resolution resolved) { + Assertions.assertThat(resolved.getMechanism()) + .describedAs("Resolution mechanism of %s", resolved) + .isEqualTo(expectedMechanism); + } + + @Test + public void testWithVPCE() throws IOException { + resolve(getConfiguration(), VPC_ENDPOINT, null, false, US_WEST_2, + RegionResolution.RegionResolutionMechanism.ParseVpceEndpoint); + } + + @Test + public void testWithChinaVPCE() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), CN_VPC_ENDPOINT, null, false, + CN_NORTHWEST_1, RegionResolution.RegionResolutionMechanism.ParseVpceEndpoint); + assertEndpoint(r, CN_VPC_ENDPOINT); + assertUseCentralValue(r, false); + } + + @Test + public void testCentralEndpointNoRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), CENTRAL_ENDPOINT, null, false, + US_EAST_1, + RegionResolution.RegionResolutionMechanism.FallbackToCentral); + assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testCentralEndpointWithRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), CENTRAL_ENDPOINT, US_WEST_2, false, + US_WEST_2, RegionResolution.RegionResolutionMechanism.Specified); + assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testConfiguredRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), null, EU_WEST_2, false, + EU_WEST_2, RegionResolution.RegionResolutionMechanism.Specified); + // this still uses the central endpoint. 
+ assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testSDKRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), null, SDK_REGION, false, + null, RegionResolution.RegionResolutionMechanism.Sdk); + // SDK handles endpoint logic. + assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testSDKUpperCaseRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), null, "SDK", false, + null, RegionResolution.RegionResolutionMechanism.Sdk); + // SDK handles endpoint logic. + assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testEmptyStringRegion() throws IOException { + final RegionResolution.Resolution r = + resolve(getConfiguration(), null, "", false, + null, RegionResolution.RegionResolutionMechanism.Sdk); + // SDK handles endpoint logic. + assertEndpoint(r, null); + assertUseCentralValue(r, true); + } + + @Test + public void testWithFipsNoEndpoint() throws IOException { + describe("Create a client with fips enabled"); + + resolve(getConfiguration(), + null, EU_WEST_2, true, + EU_WEST_2, RegionResolution.RegionResolutionMechanism.Specified); + } + + /** + * Attempting to create a client with fips enabled and an endpoint specified + * fails during client construction. 
+ */ + @Test + public void testWithFipsAndEndpoint() throws Exception { + describe("Create a client with fips and an endpoint"); + + intercept(IllegalArgumentException.class, ERROR_ENDPOINT_WITH_FIPS, () -> + resolve(getConfiguration(), US_WEST_2, null, true, US_EAST_1, null)); + } + + @Test + public void testWithRegionConfig() throws IOException { + describe("Create a client with a configured region"); + + resolve(getConfiguration(), null, EU_WEST_2, false, + EU_WEST_2, RegionResolution.RegionResolutionMechanism.Specified); + } + + @Test + public void testEUWest2Endpoint() throws IOException { + describe("specifying an eu-west-2 endpoint selects that region"); + + resolve(getConfiguration(), EU_WEST_2_ENDPOINT, null, false, + EU_WEST_2, RegionResolution.RegionResolutionMechanism.CalculatedFromEndpoint); + } + + @Test + public void testWithRegionAndEndpointConfig() throws IOException { + describe("Test that when both region and endpoint are configured, region takes precedence"); + + resolve(getConfiguration(), EU_WEST_2_ENDPOINT, US_WEST_2, false, + US_WEST_2, RegionResolution.RegionResolutionMechanism.Specified); + } + + @Test + public void testWithChinaEndpoint() throws IOException { + describe("Test with a china endpoint"); + final RegionResolution.Resolution r = + resolve(getConfiguration(), CN_ENDPOINT, null, false, + CN_NORTHWEST_1, + RegionResolution.RegionResolutionMechanism.CalculatedFromEndpoint); + assertEndpoint(r, CN_ENDPOINT); + assertUseCentralValue(r, false); + } + + @Test + public void testWithGovCloudEndpoint() throws IOException { + describe("Test with a gov cloud endpoint"); + final RegionResolution.Resolution r = + resolve(getConfiguration(), GOV_ENDPOINT, null, false, + US_GOV_EAST_1, + RegionResolution.RegionResolutionMechanism.CalculatedFromEndpoint); + assertEndpoint(r, GOV_ENDPOINT); + assertUseCentralValue(r, false); + } + + @Test + public void testNullIsForbidden() throws Throwable { + describe("The region null is forbidden as a red flag 
of configuration problems"); + intercept(IllegalArgumentException.class, () -> + resolve(getConfiguration(), null, "null", false, + null, null)); + } + + @Test + public void testGcsRegion() throws Throwable { + resolve(getConfiguration(), "https://storage.googleapis.com", null, false, + RegionResolution.EXTERNAL, + RegionResolution.RegionResolutionMechanism.ExternalEndpoint); + } + + @Test + public void testLocalhostRegion() throws Throwable { + resolve(getConfiguration(), "127.0.0.1", null, false, + RegionResolution.EXTERNAL, + RegionResolution.RegionResolutionMechanism.ExternalEndpoint); + } + + /** + * Assert that an endpoint matches the expected value. + * @param r resolution + * @param expected expected value. + */ + private static void assertEndpoint(final RegionResolution.Resolution r, + final String expected) { + Assertions.assertThat(r.getEndpointStr()) + .describedAs("Endpoint of %s", r) + .isEqualTo(expected); + } + + /** + * Assert that the resolution's {@code isUseCentralEndpoint()} value + * matches the expected value. + * @param r resolution + * @param expected expected value. 
+ */ + private static void assertUseCentralValue(final RegionResolution.Resolution r, + final boolean expected) { + Assertions.assertThat(r.isUseCentralEndpoint()) + .describedAs("isUseCentralEndpoint of %s", r) + .isEqualTo(expected); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestBucketTool.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestBucketTool.java index bbe9d74824b7a..0e7fc76cc9054 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestBucketTool.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestBucketTool.java @@ -35,10 +35,12 @@ import org.apache.hadoop.util.ExitUtil; import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeS3ExpressFileSystem; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeStoreAwsHosted; import static org.apache.hadoop.fs.s3a.S3ATestUtils.expectErrorCode; +import static org.apache.hadoop.fs.s3a.impl.RegionResolution.isSdkRegion; import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE; import static org.apache.hadoop.fs.s3a.tools.BucketTool.CREATE; import static org.apache.hadoop.fs.s3a.tools.BucketTool.NO_ZONE_SUPPLIED; @@ -142,6 +144,9 @@ public void testRecreateTestBucketS3Express() throws Throwable { public void testRecreateTestBucketNonS3Express() throws Throwable { assumeNotS3ExpressFileSystem(fs); assumeStoreAwsHosted(fs); + // fix a region if resolution is handed down to sdk + assume("Skipping as SDK region logic active", + !isSdkRegion(region)); intercept(AWSBadRequestException.class, OWNED, () -> bucketTool.exec("bucket", d(CREATE), d(OPT_REGION), region,