-
Notifications
You must be signed in to change notification settings - Fork 588
HDDS-6280. Support Container Balancer HA #3423
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
3933e57
c7633ba
b1a80d7
4b7cd67
bc99004
0501ef6
276d748
93dbdff
6cbdf21
c65f297
d0c56b3
f2c279e
d1d090f
f592ebd
61620cc
4b40d99
066f284
20fe445
58f4e81
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,6 +141,7 @@ public ContainerBalancer(StorageContainerManager scm) { | |
|
|
||
| this.lock = new ReentrantLock(); | ||
| findSourceStrategy = new FindSourceGreedy(nodeManager); | ||
| scm.getSCMServiceManager().register(this); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -196,12 +197,8 @@ private void balance() { | |
| return; | ||
| } | ||
| // otherwise, try to stop balancer | ||
| try { | ||
| stopBalancer(); | ||
| } catch (IOException | IllegalContainerBalancerStateException e) { | ||
| LOG.warn("Tried and failed to stop Container Balancer when it " + | ||
| "could not initialize an iteration", e); | ||
| } | ||
| tryStopBalancer("Could not initialize ContainerBalancer's " + | ||
| "iteration number " + i); | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -211,30 +208,18 @@ private void balance() { | |
|
|
||
| // persist next iteration index | ||
| if (iR == IterationResult.ITERATION_COMPLETED) { | ||
| lock.lock(); | ||
| try { | ||
| saveConfiguration( | ||
| config.toProtobufBuilder() | ||
| .setShouldRun(true) | ||
| .setNextIterationIndex(i + 1) | ||
| .build()); | ||
| saveConfiguration(config, true, i + 1); | ||
| } catch (IOException e) { | ||
| LOG.warn("Could not persist next iteration index value for " + | ||
| "ContainerBalancer after completing an iteration", e); | ||
| } finally { | ||
| lock.unlock(); | ||
| } | ||
| } | ||
|
|
||
| // if no new move option is generated, it means the cluster cannot be | ||
| // balanced anymore; so just stop balancer | ||
| if (iR == IterationResult.CAN_NOT_BALANCE_ANY_MORE) { | ||
| try { | ||
| stopBalancer(); | ||
| } catch (IOException | IllegalContainerBalancerStateException e) { | ||
| LOG.warn("Tried and failed to stop Container Balancer when result " + | ||
| "of the latest iteration was " + iR, e); | ||
| } | ||
| tryStopBalancer(iR.toString()); | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -260,13 +245,8 @@ private void balance() { | |
| } | ||
|
|
||
| // finally, stop balancer if it hasn't been stopped already | ||
| try { | ||
| if (isBalancerRunning()) { | ||
| stopBalancer(); | ||
| } | ||
| } catch (IOException | IllegalContainerBalancerStateException e) { | ||
| LOG.warn("Failed to stop Container Balancer after it completed all " + | ||
| "iterations", e); | ||
| if (isBalancerRunning()) { | ||
| tryStopBalancer("Completed all iterations."); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -279,7 +259,7 @@ private void balance() { | |
| */ | ||
| private boolean initializeIteration() { | ||
| if (scmContext.isInSafeMode()) { | ||
| LOG.warn("Container Balancer cannot operate while SCM is in Safe Mode."); | ||
| LOG.error("Container Balancer cannot operate while SCM is in Safe Mode."); | ||
| return false; | ||
| } | ||
| if (!scmContext.isLeader()) { | ||
|
|
@@ -861,11 +841,13 @@ public void notifyStatusChanged() { | |
| try { | ||
| if (!scmContext.isLeader() || scmContext.isInSafeMode()) { | ||
| if (isBalancerRunning()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we check and set the ServiceStatus here? For example
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In addition to my previous reply, I think holding state in ServiceStatus along with persisting it in RocksDB would make the logic a bit complex. We'd then have three ways of checking state - ServiceStatus, RocksDB, and checking the current thread for null. What do you think, @JacksonYao287? |
||
| LOG.info("Stopping ContainerBalancer in this scm on status change"); | ||
| stop(); | ||
| } | ||
| } else { | ||
| if (shouldRun()) { | ||
| try { | ||
| LOG.info("Starting ContainerBalancer in this scm on status change"); | ||
| start(); | ||
| } catch (IllegalContainerBalancerStateException | | ||
| InvalidContainerBalancerConfigurationException e) { | ||
|
|
@@ -1071,16 +1053,33 @@ public void stopBalancer() | |
| try { | ||
| // should be leader, out of safe mode, and currently running | ||
| validateState(true); | ||
| saveConfiguration(config.toProtobufBuilder() | ||
| .setShouldRun(false) | ||
| .setNextIterationIndex(0) | ||
| .build()); | ||
| saveConfiguration(config, false, 0); | ||
| stop(); | ||
| } finally { | ||
| lock.unlock(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Tries to stop ContainerBalancer. Logs the reason for stopping. Calls | ||
| * {@link ContainerBalancer#stopBalancer()}. | ||
| * @param stopReason a string specifying the reason for stopping | ||
| * ContainerBalancer. | ||
| */ | ||
| private void tryStopBalancer(String stopReason) { | ||
| lock.lock(); | ||
| try { | ||
| LOG.info("Stopping ContainerBalancer. Reason for stopping: {}", | ||
| stopReason); | ||
| stopBalancer(); | ||
| } catch (IllegalContainerBalancerStateException | IOException e) { | ||
| LOG.warn("Tried to stop ContainerBalancer but failed. Reason for " + | ||
| "stopping: {}", stopReason, e); | ||
| } finally { | ||
| lock.unlock(); | ||
| } | ||
| } | ||
|
|
||
| private void stopBalancingThread() { | ||
| Thread balancingThread; | ||
| lock.lock(); | ||
|
|
@@ -1103,6 +1102,20 @@ private void stopBalancingThread() { | |
| LOG.info("Container Balancer stopped successfully."); | ||
| } | ||
|
|
||
| private void saveConfiguration(ContainerBalancerConfiguration configuration, | ||
| boolean shouldRun, int index) | ||
| throws IOException { | ||
| lock.lock(); | ||
| try { | ||
| saveConfiguration(configuration.toProtobufBuilder() | ||
| .setShouldRun(shouldRun) | ||
| .setNextIterationIndex(index) | ||
| .build()); | ||
| } finally { | ||
| lock.unlock(); | ||
| } | ||
| } | ||
|
|
||
| private void validateConfiguration(ContainerBalancerConfiguration conf) | ||
| throws InvalidContainerBalancerConfigurationException { | ||
| // maxSizeEnteringTarget and maxSizeLeavingSource should by default be | ||
|
|
@@ -1158,27 +1171,24 @@ public void setOzoneConfiguration( | |
| } | ||
|
|
||
| /** | ||
| * Persists the configuration that ContainerBalancer will use after validating | ||
| * state and the specified configuration. | ||
| * @param configuration ContainerBalancerConfiguration to persist | ||
| * | ||
| * Persists the configuration that ContainerBalancer will use after | ||
| * validating state and the specified configuration. | ||
| * @param configuration ContainerBalancerConfiguration to persist | ||
| * @throws InvalidContainerBalancerConfigurationException on failure to | ||
| * validate the specified configuration | ||
| * @throws IllegalContainerBalancerStateException if this SCM is not leader | ||
| * or not out of safe mode or if ContainerBalancer is currently running in | ||
| * this SCM | ||
| * @throws IOException on failure to persist configuration | ||
| */ | ||
| private void setBalancerConfigOnStartBalancer( | ||
| ContainerBalancerConfiguration configuration) | ||
| throws InvalidContainerBalancerConfigurationException, | ||
| IllegalContainerBalancerStateException, IOException { | ||
| validateState(false); | ||
| validateConfiguration(configuration); | ||
| lock.lock(); | ||
| try { | ||
| saveConfiguration(configuration.toProtobufBuilder() | ||
| .setShouldRun(true) | ||
| .setNextIterationIndex(0) | ||
| .build()); | ||
| this.config = configuration; | ||
| } finally { | ||
| lock.unlock(); | ||
| } | ||
| saveConfiguration(configuration, true, 0); | ||
| this.config = configuration; | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * <p> | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * <p> | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.hadoop.hdds.scm.ha.io; | ||
|
|
||
| import com.google.protobuf.ByteString; | ||
| import com.google.protobuf.InvalidProtocolBufferException; | ||
|
|
||
| /** | ||
| * A dummy codec that serializes a ByteString object to ByteString. | ||
| */ | ||
| public class ByteStringCodec implements Codec { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We already have a ByteStringCodec class. Is it possible to reuse it?
lokeshj1703 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| @Override | ||
| public ByteString serialize(Object object) | ||
| throws InvalidProtocolBufferException { | ||
| return (ByteString) object; | ||
| } | ||
|
|
||
| @Override | ||
| public Object deserialize(Class<?> type, ByteString value) | ||
| throws InvalidProtocolBufferException { | ||
| return value; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we add a balancer running check, `currentThread = null`, inside `stopBalancer`, so that the two operations will be protected by a single lock? If we first call `isBalancerRunning()` and then call `stopBalancer()`, there might be a case where, between the two operations in a single thread, `stopBalancer` is called from another thread.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`stopBalancer` calls `validateState(true)`, which checks if balancer is currently running. Is this what you're looking for?