From a2ac281614bc72196dcbd79bd3d723a8656ab0ef Mon Sep 17 00:00:00 2001 From: sarvekshayr Date: Tue, 8 Jul 2025 16:28:24 +0530 Subject: [PATCH 1/4] HDDS-13405. ozone admin container create runs forever without kinit HDDS-13405. ozone admin container create runs forever without kinit New changes --- .../org/apache/hadoop/hdds/utils/HAUtils.java | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java index 9c3b4fefe092..09d7f0db3009 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java @@ -65,6 +65,7 @@ import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.ozone.OzoneSecurityUtil; +import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.ratis.util.ExitUtils; import org.apache.ratis.util.FileUtils; @@ -347,21 +348,44 @@ public static List getExistingSstFiles(File db) throws IOException { /** * Retry forever until CA list matches expected count. + * Fails fast on authentication exceptions. * @param task - task to get CA list. * @return CA list. 
*/ private static List getCAListWithRetry(Callable> task, long waitDuration) throws IOException { - RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep( - waitDuration, TimeUnit.SECONDS); - RetriableTask> retriableTask = - new RetriableTask<>(retryPolicy, "getCAList", task); + RetryPolicy retryPolicy = new RetryPolicy() { + private final RetryPolicy defaultPolicy = RetryPolicies.retryForeverWithFixedSleep( + waitDuration, TimeUnit.SECONDS); + + @Override + public RetryAction shouldRetry(Exception e, int retries, int failovers, boolean isIdempotent) throws Exception { + if (containsAccessControlException(e)) { + return new RetryAction(RetryAction.RetryDecision.FAIL); + } + return defaultPolicy.shouldRetry(e, retries, failovers, isIdempotent); + } + }; + + RetriableTask> retriableTask = new RetriableTask<>(retryPolicy, "getCAList", task); try { return retriableTask.call(); } catch (Exception ex) { - throw new SCMSecurityException("Unable to obtain complete CA " + - "list", ex); + if (containsAccessControlException(ex)) { + throw new AccessControlException(); + } + throw new SCMSecurityException("Unable to obtain complete CA list", ex); + } + } + + private static boolean containsAccessControlException(Throwable e) { + while (e != null) { + if (e instanceof AccessControlException) { + return true; + } + e = e.getCause(); } + return false; } private static List waitForCACerts( From 75f74306a9b724778988f579307be3dc022402c2 Mon Sep 17 00:00:00 2001 From: sarvekshayr Date: Thu, 10 Jul 2025 10:53:57 +0530 Subject: [PATCH 2/4] Added robot test --- .../src/main/compose/ozonesecure-ha/test.sh | 2 ++ .../smoketest/admincli/container-create.robot | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh index bd4df3af6ed7..509ca42725cb 100755 --- 
a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh +++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh @@ -35,6 +35,8 @@ start_docker_env execute_command_in_container kms hadoop key create ${OZONE_BUCKET_KEY_NAME} +execute_robot_test s3g admincli/container-create.robot + execute_robot_test s3g kinit.robot execute_robot_test s3g freon diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot new file mode 100644 index 000000000000..83e1b94902b4 --- /dev/null +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +*** Settings *** +Documentation Test ozone admin container create command without kinit on a SCM HA cluster +Library BuiltIn +Resource ../lib/os.robot + +*** Test Cases *** +Create container without kinit + ${output} = Execute And Ignore Error ozone admin container create + Should contain ${output} Permission denied From f76087a1b75c26f12d03ebfa355ca6e0daf58d5c Mon Sep 17 00:00:00 2001 From: sarvekshayr Date: Thu, 10 Jul 2025 12:00:23 +0530 Subject: [PATCH 3/4] Move container-create.robot to scmha --- hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh | 3 ++- .../main/smoketest/{admincli => scmha}/container-create.robot | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) rename hadoop-ozone/dist/src/main/smoketest/{admincli => scmha}/container-create.robot (96%) diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh index 509ca42725cb..6d0b4442ffa6 100755 --- a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh +++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test.sh @@ -35,7 +35,8 @@ start_docker_env execute_command_in_container kms hadoop key create ${OZONE_BUCKET_KEY_NAME} -execute_robot_test s3g admincli/container-create.robot +#Run this test before kinit on a SCM HA secure cluster +execute_robot_test s3g scmha/container-create.robot execute_robot_test s3g kinit.robot diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot b/hadoop-ozone/dist/src/main/smoketest/scmha/container-create.robot similarity index 96% rename from hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot rename to hadoop-ozone/dist/src/main/smoketest/scmha/container-create.robot index 83e1b94902b4..812a66a9cf61 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container-create.robot +++ b/hadoop-ozone/dist/src/main/smoketest/scmha/container-create.robot @@ -14,7 +14,7 @@ # limitations under the License. 
*** Settings *** -Documentation Test ozone admin container create command without kinit on a SCM HA cluster +Documentation Test ozone admin container create command without kinit on a SCM HA secure cluster Library BuiltIn Resource ../lib/os.robot From 17646c297e7276b3a494867ee0ed690297fba305 Mon Sep 17 00:00:00 2001 From: Sarveksha Yeshavantha Raju <79865743+sarvekshayr@users.noreply.github.com> Date: Thu, 10 Jul 2025 23:34:08 +0530 Subject: [PATCH 4/4] Add a warning LOG Co-authored-by: Aryan Gupta <44232823+aryangupta1998@users.noreply.github.com> --- .../src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java index 09d7f0db3009..8492755d6091 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java @@ -361,6 +361,7 @@ private static List getCAListWithRetry(Callable> task, @Override public RetryAction shouldRetry(Exception e, int retries, int failovers, boolean isIdempotent) throws Exception { if (containsAccessControlException(e)) { + LOG.warn("AccessControlException encountered during getCAList; failing fast without retry."); return new RetryAction(RetryAction.RetryDecision.FAIL); } return defaultPolicy.shouldRetry(e, retries, failovers, isIdempotent);