diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSFileIO.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSFileIO.java index 0bfce9d6055b..555b395e0d0e 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSFileIO.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSFileIO.java @@ -111,7 +111,7 @@ DataLakeFileSystemClient client(ADLSLocation location) { new DataLakeFileSystemClientBuilder().httpClient(HTTP); location.container().ifPresent(clientBuilder::fileSystemName); - azureProperties.applyClientConfiguration(location.storageAccount(), clientBuilder); + azureProperties.applyClientConfiguration(location.storageEndpoint(), clientBuilder); return clientBuilder.buildClient(); } diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java index e73093512b82..e024a5149343 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java @@ -18,26 +18,35 @@ */ package org.apache.iceberg.azure.adlsv2; +import java.net.URI; +import java.net.URISyntaxException; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * This class represents a fully qualified location in Azure expressed as a URI. + * This class represents a fully qualified location in Azure Data Lake Storage, expressed as a URI. * *

Locations follow the conventions used by Hadoop's Azure support, i.e. * - *

{@code abfs[s]://[@]/}
+ *
{@code abfs[s]://[@].dfs.core.windows.net/}
* - *

See Hadoop Azure - * Support + * or + * + *

{@code wasb[s]://@.blob.core.windows.net/}
+ * + * For compatibility, paths using the wasb scheme are also accepted but will be processed via the + * Azure Data Lake Storage Gen2 APIs and not the Blob Storage APIs. + * + *

See Hadoop + * Azure Support */ class ADLSLocation { - private static final Pattern URI_PATTERN = Pattern.compile("^abfss?://([^/?#]+)(.*)?$"); + private static final Pattern URI_PATTERN = Pattern.compile("^(abfss?|wasbs?)://[^/?#]+.*$"); - private final String storageAccount; + private final String storageEndpoint; private final String container; private final String path; @@ -50,27 +59,23 @@ class ADLSLocation { Preconditions.checkArgument(location != null, "Invalid location: null"); Matcher matcher = URI_PATTERN.matcher(location); - - ValidationException.check(matcher.matches(), "Invalid ADLS URI: %s", location); - - String authority = matcher.group(1); - String[] parts = authority.split("@", -1); - if (parts.length > 1) { - this.container = parts[0]; - this.storageAccount = parts[1]; - } else { - this.container = null; - this.storageAccount = authority; + if (!matcher.matches()) { + throw new IllegalArgumentException(String.format("Invalid ADLS URI: %s", location)); } - String uriPath = matcher.group(2); - uriPath = uriPath == null ? "" : uriPath.startsWith("/") ? uriPath.substring(1) : uriPath; - this.path = uriPath.split("\\?", -1)[0].split("#", -1)[0]; + try { + URI uri = new URI(location); + this.container = uri.getUserInfo(); + this.storageEndpoint = uri.getHost(); + this.path = stripLeadingSlash(uri.getRawPath()); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(String.format("Invalid ADLS URI: %s", location), e); + } } - /** Returns Azure storage account. */ - public String storageAccount() { - return storageAccount; + /** Returns Azure storage service endpoint. */ + public String storageEndpoint() { + return storageEndpoint; } /** Returns Azure container name. */ @@ -82,4 +87,12 @@ public Optional container() { public String path() { return path; } + + private static String stripLeadingSlash(String path) { + if (path.startsWith("/")) { + return path.substring(1); + } else { + return path; + } + } } diff --git a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java index 867b54b4c7e3..6edede187153 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java +++ b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java @@ -21,7 +21,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import org.apache.iceberg.exceptions.ValidationException; +import java.net.URI; +import java.net.URISyntaxException; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -33,17 +34,33 @@ public void testLocationParsing(String scheme) { String p1 = scheme + "://container@account.dfs.core.windows.net/path/to/file"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageEndpoint()).isEqualTo("account.dfs.core.windows.net"); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo("path/to/file"); } - @Test - public void testEncodedString() { - String p1 = "abfs://container@account.dfs.core.windows.net/path%20to%20file"; + @ParameterizedTest + @ValueSource(strings = {"wasb", "wasbs"}) + public void testWasbLocationParsing(String scheme) { + String p1 = scheme + "://container@account.blob.core.windows.net/path/to/file"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageEndpoint()).isEqualTo("account.blob.core.windows.net"); + assertThat(location.container().get()).isEqualTo("container"); + assertThat(location.path()).isEqualTo("path/to/file"); + } + + @ParameterizedTest + @ValueSource( + strings = { + "abfs://container@account.dfs.core.windows.net/path%20to%20file", + "wasb://container@account.blob.core.windows.net/path%20to%20file" + }) + public void testEncodedString(String path) throws URISyntaxException { + ADLSLocation location = new ADLSLocation(path); + String expectedEndpoint = new URI(path).getHost(); + + assertThat(location.storageEndpoint()).isEqualTo(expectedEndpoint); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo("path%20to%20file"); } @@ -51,53 +68,81 @@ public void testEncodedString() { @Test public void testMissingScheme() { assertThatThrownBy(() -> new ADLSLocation("/path/to/file")) - .isInstanceOf(ValidationException.class) + .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid ADLS URI: /path/to/file"); } @Test public void testInvalidScheme() { assertThatThrownBy(() -> new ADLSLocation("s3://bucket/path/to/file")) - .isInstanceOf(ValidationException.class) + .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid ADLS URI: s3://bucket/path/to/file"); } @Test - public void testNoContainer() { - String p1 = "abfs://account.dfs.core.windows.net/path/to/file"; - ADLSLocation location = new ADLSLocation(p1); + public void testInvalidURI() { + String invalidUri = "abfs://container@account.dfs.core.windows.net/#invalidPath#"; + assertThatThrownBy(() -> new ADLSLocation(invalidUri)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage(String.format("Invalid ADLS URI: %s", invalidUri)); + } - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://account.dfs.core.windows.net/path/to/file", + "wasb://account.blob.core.windows.net/path/to/file" + }) + public void testNoContainer(String path) throws URISyntaxException { + ADLSLocation location = new ADLSLocation(path); + String expectedEndpoint = new URI(path).getHost(); + + assertThat(location.storageEndpoint()).isEqualTo(expectedEndpoint); assertThat(location.container().isPresent()).isFalse(); assertThat(location.path()).isEqualTo("path/to/file"); } - @Test - public void testNoPath() { - String p1 = "abfs://container@account.dfs.core.windows.net"; - ADLSLocation location = new ADLSLocation(p1); - - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://container@account.dfs.core.windows.net", + "wasb://container@account.blob.core.windows.net" + }) + public void testNoPath(String path) throws URISyntaxException { + ADLSLocation location = new ADLSLocation(path); + String expectedEndpoint = new URI(path).getHost(); + + assertThat(location.storageEndpoint()).isEqualTo(expectedEndpoint); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo(""); } - @Test - public void testQueryAndFragment() { - String p1 = "abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123"; - ADLSLocation location = new ADLSLocation(p1); - - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123", + "wasb://container@account.blob.core.windows.net/path/to/file?query=foo#123" + }) + public void testQueryAndFragment(String path) throws URISyntaxException { + ADLSLocation location = new ADLSLocation(path); + String expectedEndpoint = new URI(path).getHost(); + + assertThat(location.storageEndpoint()).isEqualTo(expectedEndpoint); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo("path/to/file"); } - @Test - public void testQueryAndFragmentNoPath() { - String p1 = "abfs://container@account.dfs.core.windows.net?query=foo#123"; - ADLSLocation location = new ADLSLocation(p1); - - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://container@account.dfs.core.windows.net?query=foo#123", + "wasb://container@account.blob.core.windows.net?query=foo#123" + }) + public void testQueryAndFragmentNoPath(String path) throws URISyntaxException { + ADLSLocation location = new ADLSLocation(path); + String expectedEndpoint = new URI(path).getHost(); + + assertThat(location.storageEndpoint()).isEqualTo(expectedEndpoint); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo(""); } diff --git a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java index a858045aab8b..a8adf979f85a 100644 --- a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java +++ b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java @@ -62,7 +62,9 @@ public class ResolvingFileIO implements HadoopConfigurable, DelegateFileIO { "s3n", S3_FILE_IO_IMPL, "gs", GCS_FILE_IO_IMPL, "abfs", ADLS_FILE_IO_IMPL, - "abfss", ADLS_FILE_IO_IMPL); + "abfss", ADLS_FILE_IO_IMPL, + "wasb", ADLS_FILE_IO_IMPL, + "wasbs", ADLS_FILE_IO_IMPL); private final Map ioInstances = Maps.newConcurrentMap(); private final AtomicBoolean isClosed = new AtomicBoolean(false);