diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java
index 68797927e269..97e1ecaacadc 100644
--- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java
+++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java
@@ -23,18 +23,19 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.UncheckedIOException;
+import java.nio.file.AccessDeniedException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.iceberg.BaseMetastoreCatalog;
 import org.apache.iceberg.CatalogProperties;
 import org.apache.iceberg.CatalogUtil;
@@ -170,7 +171,9 @@ public String name() {
 
   private boolean shouldSuppressPermissionError(IOException ioException) {
     if (suppressPermissionError) {
-      return ioException.getMessage() != null && ioException.getMessage().contains("AuthorizationPermissionMismatch");
+      return ioException instanceof AccessDeniedException ||
+          (ioException.getMessage() != null &&
+          ioException.getMessage().contains("AuthorizationPermissionMismatch"));
     }
     return false;
   }
@@ -220,14 +223,15 @@ public List<TableIdentifier> listTables(Namespace namespace) {
       if (!isDirectory(nsPath)) {
         throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace);
       }
-
-      for (FileStatus s : fs.listStatus(nsPath)) {
-        if (!s.isDirectory()) {
+      RemoteIterator<FileStatus> it = fs.listStatusIterator(nsPath);
+      while (it.hasNext()) {
+        FileStatus status = it.next();
+        if (!status.isDirectory()) {
           // Ignore the path which is not a directory.
           continue;
         }
 
-        Path path = s.getPath();
+        Path path = status.getPath();
         if (isTableDir(path)) {
           TableIdentifier tblIdent = TableIdentifier.of(namespace, path.getName());
           tblIdents.add(tblIdent);
@@ -329,11 +333,17 @@ public List<Namespace> listNamespaces(Namespace namespace) {
     }
 
     try {
-      return Stream.of(fs.listStatus(nsPath))
-        .map(FileStatus::getPath)
-        .filter(this::isNamespace)
-        .map(path -> append(namespace, path.getName()))
-        .collect(Collectors.toList());
+      // using the iterator listing allows for paged downloads
+      // from HDFS and prefetching from object storage.
+      List<Namespace> namespaces = new ArrayList<>();
+      RemoteIterator<FileStatus> it = fs.listStatusIterator(nsPath);
+      while (it.hasNext()) {
+        Path path = it.next().getPath();
+        if (isNamespace(path)) {
+          namespaces.add(append(namespace, path.getName()));
+        }
+      }
+      return namespaces;
     } catch (IOException ioe) {
       throw new RuntimeIOException(ioe, "Failed to list namespace under: %s", namespace);
     }
diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java
index 64ae92e77caa..08fa022a0ab4 100644
--- a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java
+++ b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java
@@ -262,13 +262,13 @@ public void testListTables() throws Exception {
 
     List<TableIdentifier> tbls1 = catalog.listTables(Namespace.of("db"));
     Set<String> tblSet = Sets.newHashSet(tbls1.stream().map(t -> t.name()).iterator());
-    Assert.assertEquals(tblSet.size(), 2);
+    Assert.assertEquals(2, tblSet.size());
     Assert.assertTrue(tblSet.contains("tbl1"));
     Assert.assertTrue(tblSet.contains("tbl2"));
 
     List<TableIdentifier> tbls2 = catalog.listTables(Namespace.of("db", "ns1"));
-    Assert.assertEquals(tbls2.size(), 1);
-    Assert.assertTrue(tbls2.get(0).name().equals("tbl3"));
+    Assert.assertEquals("table identifiers", 1, tbls2.size());
+    Assert.assertEquals("table name", "tbl3", tbls2.get(0).name());
 
     AssertHelpers.assertThrows("should throw exception", NoSuchNamespaceException.class,
         "Namespace does not exist: ", () -> {
@@ -337,24 +337,24 @@ public void testListNamespace() throws Exception {
 
     List<Namespace> nsp1 = catalog.listNamespaces(Namespace.of("db"));
     Set<String> tblSet = Sets.newHashSet(nsp1.stream().map(t -> t.toString()).iterator());
-    Assert.assertEquals(tblSet.size(), 3);
+    Assert.assertEquals(3, tblSet.size());
     Assert.assertTrue(tblSet.contains("db.ns1"));
     Assert.assertTrue(tblSet.contains("db.ns2"));
     Assert.assertTrue(tblSet.contains("db.ns3"));
 
     List<Namespace> nsp2 = catalog.listNamespaces(Namespace.of("db", "ns1"));
-    Assert.assertEquals(nsp2.size(), 1);
+    Assert.assertEquals(1, nsp2.size());
     Assert.assertTrue(nsp2.get(0).toString().equals("db.ns1.ns2"));
 
     List<Namespace> nsp3 = catalog.listNamespaces();
     Set<String> tblSet2 = Sets.newHashSet(nsp3.stream().map(t -> t.toString()).iterator());
-    Assert.assertEquals(tblSet2.size(), 2);
+    Assert.assertEquals(2, tblSet2.size());
     Assert.assertTrue(tblSet2.contains("db"));
     Assert.assertTrue(tblSet2.contains("db2"));
 
     List<Namespace> nsp4 = catalog.listNamespaces();
     Set<String> tblSet3 = Sets.newHashSet(nsp4.stream().map(t -> t.toString()).iterator());
-    Assert.assertEquals(tblSet3.size(), 2);
+    Assert.assertEquals(2, tblSet3.size());
     Assert.assertTrue(tblSet3.contains("db"));
     Assert.assertTrue(tblSet3.contains("db2"));
 