From 7f0c846bf2e44bc43ee44dca5b278c0323f70ab8 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Thu, 25 Jul 2024 15:58:22 +0530 Subject: [PATCH 1/7] HDDS-11231. Ozone Recon - Make Recon restart more resilient and handle restart or start failures. --- ...estReconInsightsForDeletedDirectories.java | 2 +- .../impl/OzoneManagerServiceProviderImpl.java | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconInsightsForDeletedDirectories.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconInsightsForDeletedDirectories.java index ca8fcae6643b..8656682cd151 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconInsightsForDeletedDirectories.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconInsightsForDeletedDirectories.java @@ -476,7 +476,7 @@ private boolean assertTableRowCount(int expectedCount, return count.get() == expectedCount; } - private void syncDataFromOM() { + private void syncDataFromOM() throws IOException { // Sync data from Ozone Manager to Recon. OzoneManagerServiceProviderImpl impl = (OzoneManagerServiceProviderImpl) cluster.getReconServer().getOzoneManagerServiceProvider(); diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index bde89eea1da8..41531993479d 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -38,9 +38,12 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; +import com.google.common.collect.Iterators; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.hdds.recon.ReconConfigKeys; import org.apache.hadoop.hdds.utils.db.RocksDatabase; +import org.apache.hadoop.hdds.utils.db.Table; +import org.apache.hadoop.hdds.utils.db.TableIterator; import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch; import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions; import org.apache.hadoop.hdfs.web.URLConnectionFactory; @@ -260,6 +263,18 @@ public void start() { omMetadataManager.start(configuration); } catch (IOException ioEx) { LOG.error("Error starting Recon OM Metadata Manager.", ioEx); + } catch (RuntimeException runtimeException) { + LOG.warn("Unexpected runtime error starting Recon OM Metadata Manager.", runtimeException); + LOG.warn("Trying to delete existing recon OM snapshot DB and fetch new one."); + metrics.incrNumSnapshotRequests(); + LOG.info("Fetching full snapshot from Ozone Manager"); + // Update local Recon OM DB to new snapshot. + try { + boolean success = updateReconOmDBWithNewSnapshot(); + LOG.info("Fetched full new snapshot from Ozone Manager: {}", success); + } catch (IOException e) { + throw new RuntimeException(e); + } } reconTaskController.start(); long initialDelay = configuration.getTimeDuration( @@ -539,7 +554,7 @@ boolean innerGetAndApplyDeltaUpdatesFromOM(long fromSequenceNumber, * full snapshot from Ozone Manager. */ @VisibleForTesting - public boolean syncDataFromOM() { + public boolean syncDataFromOM() throws IOException { if (isSyncDataFromOMRunning.compareAndSet(false, true)) { try { LOG.info("Syncing data from Ozone Manager."); @@ -613,6 +628,7 @@ public boolean syncDataFromOM() { reconContext.updateErrors(ReconContext.ErrorCode.GET_OM_DB_SNAPSHOT_FAILED); } } + printFileAndKeyTableCount(omMetadataManager); } finally { isSyncDataFromOMRunning.set(false); } @@ -623,6 +639,35 @@ public boolean syncDataFromOM() { return true; } + private void printFileAndKeyTableCount(ReconOMMetadataManager omMetadataManager) throws IOException { + Table fileTable = omMetadataManager.getTable("fileTable"); + Table keyTable = omMetadataManager.getTable("keyTable"); + if (keyTable == null) { + LOG.error("Table keyTable not found in OM Metadata."); + return; + } + + if (LOG.isDebugEnabled()) { + try (TableIterator> iterator + = keyTable.iterator()) { + long count = Iterators.size(iterator); + LOG.debug("keyTable Table count: {}", count); + } + } + + if (fileTable == null) { + LOG.error("Table fileTable not found in OM Metadata."); + } + + if (LOG.isDebugEnabled()) { + try (TableIterator> iterator + = fileTable.iterator()) { + long count = Iterators.size(iterator); + LOG.debug("fileTable Table count: {}", count); + } + } + } + public void checkAndValidateReconDbPermissions() { File dbDir = new File(reconDbDir.getPath()); if (!dbDir.exists()) { From 196de86651e6bb6b5b8191e20c4ab5ed3c1ed3e2 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Thu, 25 Jul 2024 16:22:45 +0530 Subject: [PATCH 2/7] HDDS-11231. Ozone Recon - Make Recon restart more resilient and handle restart or start failures. --- .../ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index 41531993479d..faecd79fccbf 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -628,7 +628,7 @@ public boolean syncDataFromOM() throws IOException { reconContext.updateErrors(ReconContext.ErrorCode.GET_OM_DB_SNAPSHOT_FAILED); } } - printFileAndKeyTableCount(omMetadataManager); + printFileAndKeyTableCount(); } finally { isSyncDataFromOMRunning.set(false); } @@ -639,7 +639,7 @@ public boolean syncDataFromOM() throws IOException { return true; } - private void printFileAndKeyTableCount(ReconOMMetadataManager omMetadataManager) throws IOException { + private void printFileAndKeyTableCount() throws IOException { Table fileTable = omMetadataManager.getTable("fileTable"); Table keyTable = omMetadataManager.getTable("keyTable"); if (keyTable == null) { From 38e6e1a401ce1c63ea369279c37b82ff1488a208 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Fri, 26 Jul 2024 09:08:32 +0530 Subject: [PATCH 3/7] HDDS-11231. Handled review comments. --- .../recon/spi/impl/OzoneManagerServiceProviderImpl.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index faecd79fccbf..99b0e09d0b79 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -644,10 +644,9 @@ private void printFileAndKeyTableCount() throws IOException { Table keyTable = omMetadataManager.getTable("keyTable"); if (keyTable == null) { LOG.error("Table keyTable not found in OM Metadata."); - return; } - if (LOG.isDebugEnabled()) { + if (LOG.isDebugEnabled() && null != keyTable) { try (TableIterator> iterator = keyTable.iterator()) { long count = Iterators.size(iterator); @@ -659,7 +658,7 @@ private void printFileAndKeyTableCount() throws IOException { LOG.error("Table fileTable not found in OM Metadata."); } - if (LOG.isDebugEnabled()) { + if (LOG.isDebugEnabled() && null != fileTable) { try (TableIterator> iterator = fileTable.iterator()) { long count = Iterators.size(iterator); From 65d3a3df731efcd969ae526e9ca96a67f15710df Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Mon, 29 Jul 2024 12:13:38 +0530 Subject: [PATCH 4/7] HDDS-11231. Handled review comments. --- .../impl/OzoneManagerServiceProviderImpl.java | 37 ++++++++----------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index 99b0e09d0b79..6738eb018bfd 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -552,6 +552,8 @@ boolean innerGetAndApplyDeltaUpdatesFromOM(long fromSequenceNumber, /** * Based on current state of Recon's OM DB, we either get delta updates or * full snapshot from Ozone Manager. + * @return true or false if sync operation between Recon and OM was successful or failed. + * @throws IOException */ @VisibleForTesting public boolean syncDataFromOM() throws IOException { @@ -628,7 +630,7 @@ public boolean syncDataFromOM() throws IOException { reconContext.updateErrors(ReconContext.ErrorCode.GET_OM_DB_SNAPSHOT_FAILED); } } - printFileAndKeyTableCount(); + printOMDBMetaInfo(); } finally { isSyncDataFromOMRunning.set(false); } @@ -639,30 +641,21 @@ public boolean syncDataFromOM() throws IOException { return true; } - private void printFileAndKeyTableCount() throws IOException { - Table fileTable = omMetadataManager.getTable("fileTable"); - Table keyTable = omMetadataManager.getTable("keyTable"); - if (keyTable == null) { - LOG.error("Table keyTable not found in OM Metadata."); - } - - if (LOG.isDebugEnabled() && null != keyTable) { - try (TableIterator> iterator - = keyTable.iterator()) { - long count = Iterators.size(iterator); - LOG.debug("keyTable Table count: {}", count); - } - } + private void printOMDBMetaInfo() throws IOException { + printTableCount("fileTable"); + printTableCount("keyTable"); + } - if (fileTable == null) { - LOG.error("Table fileTable not found in OM Metadata."); + private void printTableCount(String tableName) throws IOException { + Table table = omMetadataManager.getTable(tableName); + if (table == null) { + LOG.error("Table {} not found in OM Metadata.", tableName); + return; } - - if (LOG.isDebugEnabled() && null != fileTable) { - try (TableIterator> iterator - = fileTable.iterator()) { + if (LOG.isDebugEnabled()) { + try (TableIterator> iterator = table.iterator()) { long count = Iterators.size(iterator); - LOG.debug("fileTable Table count: {}", count); + LOG.debug("{} Table count: {}", tableName, count); } } } From 09bd9980943006a28fe2e3c314dab1f8706571a0 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Mon, 29 Jul 2024 16:41:42 +0530 Subject: [PATCH 5/7] HDDS-11231. Handled review comments. --- .../recon/spi/impl/OzoneManagerServiceProviderImpl.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index 6738eb018bfd..a4e68e03ebeb 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -553,10 +553,9 @@ boolean innerGetAndApplyDeltaUpdatesFromOM(long fromSequenceNumber, * Based on current state of Recon's OM DB, we either get delta updates or * full snapshot from Ozone Manager. * @return true or false if sync operation between Recon and OM was successful or failed. - * @throws IOException */ @VisibleForTesting - public boolean syncDataFromOM() throws IOException { + public boolean syncDataFromOM() { if (isSyncDataFromOMRunning.compareAndSet(false, true)) { try { LOG.info("Syncing data from Ozone Manager."); @@ -641,12 +640,12 @@ public boolean syncDataFromOM() throws IOException { return true; } - private void printOMDBMetaInfo() throws IOException { + private void printOMDBMetaInfo() { printTableCount("fileTable"); printTableCount("keyTable"); } - private void printTableCount(String tableName) throws IOException { + private void printTableCount(String tableName) { Table table = omMetadataManager.getTable(tableName); if (table == null) { LOG.error("Table {} not found in OM Metadata.", tableName); @@ -656,6 +655,8 @@ private void printTableCount(String tableName) throws IOException { try (TableIterator> iterator = table.iterator()) { long count = Iterators.size(iterator); LOG.debug("{} Table count: {}", tableName, count); + } catch (IOException ioException) { + LOG.error("Unexpected error while iterating table for table count: {}", tableName); } } } From dc9df054d28efa5caacda4078e2bc1a92d05a428 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Mon, 29 Jul 2024 17:53:54 +0530 Subject: [PATCH 6/7] HDDS-11231. Handled review comments. --- .../recon/spi/impl/OzoneManagerServiceProviderImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index a4e68e03ebeb..91a97725387b 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -271,7 +271,11 @@ public void start() { // Update local Recon OM DB to new snapshot. try { boolean success = updateReconOmDBWithNewSnapshot(); - LOG.info("Fetched full new snapshot from Ozone Manager: {}", success); + if (success) { + LOG.info("Successfully fetched a full snapshot from Ozone Manager"); + } else { + LOG.error("Failed fetching a full snapshot from Ozone Manager"); + } } catch (IOException e) { throw new RuntimeException(e); } From eb236ded0e44cc338125e5380a7d28a29170b816 Mon Sep 17 00:00:00 2001 From: deveshsingh Date: Tue, 30 Jul 2024 10:34:59 +0530 Subject: [PATCH 7/7] HDDS-11231. Handled review comments. --- .../ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java index 91a97725387b..491d631249cd 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/OzoneManagerServiceProviderImpl.java @@ -277,7 +277,8 @@ public void start() { LOG.error("Failed fetching a full snapshot from Ozone Manager"); } } catch (IOException e) { - throw new RuntimeException(e); + LOG.error("Unexpected IOException occurred while trying to fetch a full snapshot: {}", e); + throw new RuntimeException(runtimeException); } } reconTaskController.start();