From 33b46615a918d409be18fb855f0eceb593edbff1 Mon Sep 17 00:00:00 2001 From: ashishk Date: Wed, 7 Feb 2024 17:11:23 +0530 Subject: [PATCH 1/6] HDDS-10256. Block allocation retry when SCM is in safe mode. --- .../hadoop/ozone/om/TestScmSafeMode.java | 51 +++++++++++++++++++ .../ozone/om/request/key/OMKeyRequest.java | 33 ++++++++---- 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java index 0a8c256b46a3..f7ffabcf803e 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java @@ -17,6 +17,12 @@ */ package org.apache.hadoop.ozone.om; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.SafeMode; +import org.apache.hadoop.fs.SafeModeAction; import org.apache.hadoop.hdds.utils.IOUtils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.hadoop.hdds.HddsConfigKeys; @@ -55,11 +61,14 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeoutException; import static org.apache.hadoop.hdds.client.ReplicationType.RATIS; import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; +import static org.apache.hadoop.ozone.OzoneConsts.OZONE_OFS_URI_SCHEME; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -350,4 +359,46 @@ public void testSCMSafeModeDisabled() throws Exception { cluster.restartStorageContainerManager(true); assertFalse(scm.isInSafeMode()); } + + @Test + public void testCreateRetryWhileSCMSafeMode() throws Exception { + // Test1: Test safe mode when there are no containers in system. + cluster.stop(); + + try { + cluster = builder.build(); + } catch (IOException e) { + fail("Cluster startup failed."); + } + + final String rootPath = String.format("%s://%s/", + OZONE_OFS_URI_SCHEME, conf.get(OZONE_OM_ADDRESS_KEY)); + conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, rootPath); + + try (FileSystem fs = FileSystem.get(conf)) { + assertTrue(((SafeMode)fs).setSafeMode(SafeModeAction.GET)); + + Thread t = new Thread(() -> { + try { + LOG.info("Sleep 10 seconds and then start DataNodes."); + Thread.sleep(10 * 1000); + + cluster.startHddsDatanodes(); + cluster.waitForClusterToBeReady(); + cluster.waitTobeOutOfSafeMode(); + } catch (InterruptedException | TimeoutException e) { + throw new RuntimeException(e); + } + }); + t.start(); + + final Path file = new Path("file"); + try (FSDataOutputStream outputStream = fs.create(file, true)) { + LOG.info("Successfully created a file"); + } + t.join(); + } + + assertFalse(cluster.getStorageContainerManager().isInSafeMode()); + } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java index c06aa186cc75..5315c83822bd 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java @@ -199,18 +199,29 @@ protected List< OmKeyLocationInfo > allocateBlock(ScmClient scmClient, List locationInfos = new ArrayList<>(numBlocks); String remoteUser = getRemoteUser().getShortUserName(); List allocatedBlocks; - try { - allocatedBlocks = scmClient.getBlockClient() - .allocateBlock(scmBlockSize, numBlocks, replicationConfig, serviceID, - excludeList, clientMachine); - } catch (SCMException ex) { - omMetrics.incNumBlockAllocateCallFails(); - if (ex.getResult() - .equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION)) { - throw new OMException(ex.getMessage(), - OMException.ResultCodes.SCM_IN_SAFE_MODE); + int retryCount = 5; + while (true) { + try { + allocatedBlocks = scmClient.getBlockClient() + .allocateBlock(scmBlockSize, numBlocks, replicationConfig, serviceID, + excludeList, clientMachine); + } catch (SCMException ex) { + omMetrics.incNumBlockAllocateCallFails(); + if (ex.getResult().equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION) && retryCount > 0) { + retryCount--; + // SCM is in safe mode, retry again + try { + Thread.sleep(3000); + continue; + } catch (InterruptedException e) { + throw new OMException(ex.getMessage(), OMException.ResultCodes.SCM_IN_SAFE_MODE); + } + } else if (ex.getResult().equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION) && retryCount == 0) { + throw new OMException(ex.getMessage(), OMException.ResultCodes.SCM_IN_SAFE_MODE); + } + throw ex; } - throw ex; + break; } for (AllocatedBlock allocatedBlock : allocatedBlocks) { BlockID blockID = new BlockID(allocatedBlock.getBlockID()); From 19252934dd7a0b21831497da82bfb4904e6608f2 Mon Sep 17 00:00:00 2001 From: ashishk Date: Thu, 8 Feb 2024 23:13:22 +0530 Subject: [PATCH 2/6] Fix review comments --- .../apache/hadoop/ozone/om/request/key/OMKeyRequest.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java index c0472545def0..b891638eac33 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java @@ -111,6 +111,10 @@ public abstract class OMKeyRequest extends OMClientRequest { private BucketLayout bucketLayout = BucketLayout.DEFAULT; + public static final int BLOCK_ALLOCATION_RETRY_COUNT = 5; + + public static final int BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS = 3000; + public OMKeyRequest(OMRequest omRequest) { super(omRequest); } @@ -199,7 +203,7 @@ protected List< OmKeyLocationInfo > allocateBlock(ScmClient scmClient, List locationInfos = new ArrayList<>(numBlocks); String remoteUser = getRemoteUser().getShortUserName(); List allocatedBlocks; - int retryCount = 5; + int retryCount = BLOCK_ALLOCATION_RETRY_COUNT; while (true) { try { allocatedBlocks = scmClient.getBlockClient() @@ -208,10 +212,11 @@ protected List< OmKeyLocationInfo > allocateBlock(ScmClient scmClient, } catch (SCMException ex) { omMetrics.incNumBlockAllocateCallFails(); if (ex.getResult().equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION) && retryCount > 0) { + LOG.debug("Allocate block failed as SCM is in safe mode, number of retries remaining: {}", retryCount); retryCount--; // SCM is in safe mode, retry again try { - Thread.sleep(3000); + Thread.sleep(BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS); continue; } catch (InterruptedException e) { throw new OMException(ex.getMessage(), OMException.ResultCodes.SCM_IN_SAFE_MODE); From c1ba765cfa5a2ca9858ee044f6686a53d1fcaa3f Mon Sep 17 00:00:00 2001 From: ashishk Date: Fri, 9 Feb 2024 12:39:50 +0530 Subject: [PATCH 3/6] Move retry logic to client --- ...ManagerProtocolClientSideTranslatorPB.java | 39 ++++++++++++++++--- .../ozone/om/request/key/OMKeyRequest.java | 38 ++++++------------ 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java index bd40dfcf0240..434ef5717ee7 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java @@ -229,6 +229,7 @@ import static org.apache.hadoop.ozone.OzoneConsts.OM_S3_CALLER_CONTEXT_PREFIX; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes; +import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.SCM_IN_SAFE_MODE; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.TOKEN_ERROR_OTHER; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.CancelPrepareRequest; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.CancelPrepareResponse; @@ -258,6 +259,10 @@ public final class OzoneManagerProtocolClientSideTranslatorPB = new ThreadLocal<>(); private boolean s3AuthCheck; + + public static final int BLOCK_ALLOCATION_RETRY_COUNT = 5; + public static final int BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS = 3000; + public OzoneManagerProtocolClientSideTranslatorPB(OmTransport omTransport, String clientId) { this.clientID = clientId; @@ -728,8 +733,7 @@ public OpenKeySession openKey(OmKeyArgs args) throws IOException { .setCreateKeyRequest(req) .build(); - CreateKeyResponse keyResponse = - handleError(submitRequest(omRequest)).getCreateKeyResponse(); + CreateKeyResponse keyResponse = handleSubmitRequestAndSCMSafeModeRetry(omRequest).getCreateKeyResponse(); return new OpenKeySession(keyResponse.getID(), OmKeyInfo.getFromProtobuf(keyResponse.getKeyInfo()), keyResponse.getOpenVersion()); @@ -774,8 +778,7 @@ public OmKeyLocationInfo allocateBlock(OmKeyArgs args, long clientId, .setAllocateBlockRequest(req) .build(); - AllocateBlockResponse resp = handleError(submitRequest(omRequest)) - .getAllocateBlockResponse(); + AllocateBlockResponse resp = handleSubmitRequestAndSCMSafeModeRetry(omRequest).getAllocateBlockResponse(); return OmKeyLocationInfo.getFromProtobuf(resp.getKeyLocation()); } @@ -2243,12 +2246,36 @@ public OpenKeySession createFile(OmKeyArgs args, OMRequest omRequest = createOMRequest(Type.CreateFile) .setCreateFileRequest(createFileRequest) .build(); - CreateFileResponse resp = - handleError(submitRequest(omRequest)).getCreateFileResponse(); + CreateFileResponse resp = handleSubmitRequestAndSCMSafeModeRetry(omRequest).getCreateFileResponse(); + return new OpenKeySession(resp.getID(), OmKeyInfo.getFromProtobuf(resp.getKeyInfo()), resp.getOpenVersion()); } + + @Nonnull + private OMResponse handleSubmitRequestAndSCMSafeModeRetry(OMRequest omRequest) throws IOException { + int retryCount = BLOCK_ALLOCATION_RETRY_COUNT; + while (true) { + try { + return handleError(submitRequest(omRequest)); + } catch (OMException e) { + if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount > 0) { + retryCount--; + try { + Thread.sleep(BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS); + continue; + } catch (InterruptedException ex) { + throw new OMException(ex.getMessage(), ResultCodes.SCM_IN_SAFE_MODE); + } + } else if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount == 0) { + throw new OMException(e.getMessage(), ResultCodes.SCM_IN_SAFE_MODE); + } + throw e; + } + } + } + @Override public List listStatus(OmKeyArgs args, boolean recursive, String startKey, long numEntries, boolean allowPartialPrefixes) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java index b891638eac33..d7cdd3632005 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/key/OMKeyRequest.java @@ -111,10 +111,6 @@ public abstract class OMKeyRequest extends OMClientRequest { private BucketLayout bucketLayout = BucketLayout.DEFAULT; - public static final int BLOCK_ALLOCATION_RETRY_COUNT = 5; - - public static final int BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS = 3000; - public OMKeyRequest(OMRequest omRequest) { super(omRequest); } @@ -203,30 +199,18 @@ protected List< OmKeyLocationInfo > allocateBlock(ScmClient scmClient, List locationInfos = new ArrayList<>(numBlocks); String remoteUser = getRemoteUser().getShortUserName(); List allocatedBlocks; - int retryCount = BLOCK_ALLOCATION_RETRY_COUNT; - while (true) { - try { - allocatedBlocks = scmClient.getBlockClient() - .allocateBlock(scmBlockSize, numBlocks, replicationConfig, serviceID, - excludeList, clientMachine); - } catch (SCMException ex) { - omMetrics.incNumBlockAllocateCallFails(); - if (ex.getResult().equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION) && retryCount > 0) { - LOG.debug("Allocate block failed as SCM is in safe mode, number of retries remaining: {}", retryCount); - retryCount--; - // SCM is in safe mode, retry again - try { - Thread.sleep(BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS); - continue; - } catch (InterruptedException e) { - throw new OMException(ex.getMessage(), OMException.ResultCodes.SCM_IN_SAFE_MODE); - } - } else if (ex.getResult().equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION) && retryCount == 0) { - throw new OMException(ex.getMessage(), OMException.ResultCodes.SCM_IN_SAFE_MODE); - } - throw ex; + try { + allocatedBlocks = scmClient.getBlockClient() + .allocateBlock(scmBlockSize, numBlocks, replicationConfig, serviceID, + excludeList, clientMachine); + } catch (SCMException ex) { + omMetrics.incNumBlockAllocateCallFails(); + if (ex.getResult() + .equals(SCMException.ResultCodes.SAFE_MODE_EXCEPTION)) { + throw new OMException(ex.getMessage(), + OMException.ResultCodes.SCM_IN_SAFE_MODE); } - break; + throw ex; } for (AllocatedBlock allocatedBlock : allocatedBlocks) { BlockID blockID = new BlockID(allocatedBlock.getBlockID()); From 5f18f9787e9c39650af744a1218d7d417538a862 Mon Sep 17 00:00:00 2001 From: ashishk Date: Fri, 9 Feb 2024 13:52:12 +0530 Subject: [PATCH 4/6] Optimize test case --- .../java/org/apache/hadoop/ozone/om/TestScmSafeMode.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java index c0a07fdf878d..a5e3c69a5368 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java @@ -358,14 +358,17 @@ public void testCreateRetryWhileSCMSafeMode() throws Exception { final String rootPath = String.format("%s://%s/", OZONE_OFS_URI_SCHEME, conf.get(OZONE_OM_ADDRESS_KEY)); conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, rootPath); + OMMetrics omMetrics = cluster.getOzoneManager().getMetrics(); + long allocateBlockReqCount = omMetrics.getNumBlockAllocateFails(); try (FileSystem fs = FileSystem.get(conf)) { assertTrue(((SafeMode)fs).setSafeMode(SafeModeAction.GET)); Thread t = new Thread(() -> { try { - LOG.info("Sleep 10 seconds and then start DataNodes."); - Thread.sleep(10 * 1000); + LOG.info("Wait for allocate block fails at least once"); + GenericTestUtils.waitFor(() -> omMetrics.getNumBlockAllocateFails() > allocateBlockReqCount, + 100, 10000); cluster.startHddsDatanodes(); cluster.waitForClusterToBeReady(); From 933682807141164b422d65b32f531663b3a76d92 Mon Sep 17 00:00:00 2001 From: ashishk Date: Fri, 9 Feb 2024 14:45:00 +0530 Subject: [PATCH 5/6] Add CLI message to show wait reason --- .../protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java index 434ef5717ee7..1fde5d6bfd18 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java @@ -2261,6 +2261,8 @@ private OMResponse handleSubmitRequestAndSCMSafeModeRetry(OMRequest omRequest) t return handleError(submitRequest(omRequest)); } catch (OMException e) { if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount > 0) { + System.err.println("SCM is in safe mode, request will be retried after " + + BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS + "ms"); retryCount--; try { Thread.sleep(BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS); From 643789d2448002fc335c994cb9939ee9eae42bc5 Mon Sep 17 00:00:00 2001 From: ashishk Date: Fri, 9 Feb 2024 16:09:44 +0530 Subject: [PATCH 6/6] Update CLI message --- .../protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java index 1fde5d6bfd18..7d029ba044de 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java @@ -2261,7 +2261,7 @@ private OMResponse handleSubmitRequestAndSCMSafeModeRetry(OMRequest omRequest) t return handleError(submitRequest(omRequest)); } catch (OMException e) { if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount > 0) { - System.err.println("SCM is in safe mode, request will be retried after " + + System.err.println("SCM is in safe mode. Will retry in " + BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS + "ms"); retryCount--; try {