Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ public final class HddsConfigKeys {
HDDS_X509_GRACE_DURATION_TOKEN_CHECKS_ENABLED_DEFAULT = true;
public static final String HDDS_NEW_KEY_CERT_DIR_NAME_SUFFIX = "-next";
public static final String HDDS_BACKUP_KEY_CERT_DIR_NAME_SUFFIX = "-previous";
public static final String HDDS_NEW_KEY_CERT_DIR_NAME_PROGRESS_SUFFIX =
"-progress";
public static final String HDDS_X509_CA_ROTATION_CHECK_INTERNAL =
"hdds.x509.ca.rotation.check.interval";
public static final String HDDS_X509_CA_ROTATION_CHECK_INTERNAL_DEFAULT =
Expand All @@ -216,6 +218,10 @@ public final class HddsConfigKeys {
// format hh:mm:ss, representing hour, minute, and second
public static final String HDDS_X509_CA_ROTATION_TIME_OF_DAY_DEFAULT =
"02:00:00";
// CA rotation ack timeout. The value is an ISO-8601 duration string that
// SecurityConfig reads with Duration.parse (the default "PT15M" is 15
// minutes). SecurityConfig rejects a zero or negative value, and a value
// that is not smaller than the certificate renewal grace duration.
// NOTE(review): presumably the time allowed for rotation acknowledgements to
// arrive - confirm against the rotation manager that consumes it.
public static final String HDDS_X509_CA_ROTATION_ACK_TIMEOUT =
"hdds.x509.ca.rotation.ack.timeout";
public static final String HDDS_X509_CA_ROTATION_ACK_TIMEOUT_DEFAULT =
"PT15M";

public static final String HDDS_CONTAINER_REPLICATION_COMPRESSION =
"hdds.container.replication.compression";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,6 @@ public enum ResultCodes {
INVALID_PIPELINE_STATE,
DUPLICATED_PIPELINE_ID,
TIMEOUT,
CA_ROTATION_IN_PROGRESS
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DEFAULT_KEY_ALGORITHM;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DEFAULT_KEY_LEN;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DEFAULT_SECURITY_PROVIDER;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_CA_ROTATION_ACK_TIMEOUT;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_CA_ROTATION_ACK_TIMEOUT_DEFAULT;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_CA_ROTATION_CHECK_INTERNAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_CA_ROTATION_CHECK_INTERNAL_DEFAULT;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_CA_ROTATION_TIME_OF_DAY;
Expand Down Expand Up @@ -127,6 +129,7 @@ public class SecurityConfig {
private final String caRotationTimeOfDay;
private final Pattern caRotationTimeOfDayPattern =
Pattern.compile("\\d{2}:\\d{2}:\\d{2}");
private final Duration caAckTimeout;
private final SslProvider grpcSSLProvider;

/**
Expand Down Expand Up @@ -218,6 +221,11 @@ public SecurityConfig(ConfigurationSource configuration) {
}
caRotationTimeOfDay = "1970-01-01T" + timeOfDayString;

String ackTimeString = configuration.get(
HDDS_X509_CA_ROTATION_ACK_TIMEOUT,
HDDS_X509_CA_ROTATION_ACK_TIMEOUT_DEFAULT);
caAckTimeout = Duration.parse(ackTimeString);

validateCertificateValidityConfig();

this.externalRootCaCert = configuration.get(
Expand Down Expand Up @@ -287,12 +295,32 @@ private void validateCertificateValidityConfig() {
throw new IllegalArgumentException(msg);
}

if (caCheckInterval.isNegative() || caCheckInterval.isZero()) {
String msg = "Property " + HDDS_X509_CA_ROTATION_CHECK_INTERNAL +
" should not be zero or negative";
LOG.error(msg);
throw new IllegalArgumentException(msg);
}

if (caCheckInterval.compareTo(renewalGracePeriod) >= 0) {
throw new IllegalArgumentException("Property value of " +
HDDS_X509_CA_ROTATION_CHECK_INTERNAL +
" should be smaller than " + HDDS_X509_RENEW_GRACE_DURATION);
}

if (caAckTimeout.isNegative() || caAckTimeout.isZero()) {
String msg = "Property " + HDDS_X509_CA_ROTATION_ACK_TIMEOUT +
" should not be zero or negative";
LOG.error(msg);
throw new IllegalArgumentException(msg);
}

if (caAckTimeout.compareTo(renewalGracePeriod) >= 0) {
throw new IllegalArgumentException("Property value of " +
HDDS_X509_CA_ROTATION_ACK_TIMEOUT +
" should be smaller than " + HDDS_X509_RENEW_GRACE_DURATION);
}

if (tokenSanityChecksEnabled
&& blockTokenExpiryDurationMs > renewalGracePeriod.toMillis()) {
throw new IllegalArgumentException(" Certificate grace period " +
Expand Down Expand Up @@ -396,6 +424,18 @@ public Path getCertificateLocation(String component) {
return Paths.get(metadataDir, component, certificateDir);
}

/**
 * Resolves the directory of the given component underneath the configured
 * metadata directory.
 *
 * @param component - Component Name - String.
 * @return Path built from the metadata directory and the component name.
 * @throws NullPointerException if the metadata directory is not configured.
 */
public Path getLocation(String component) {
  // Fail fast with a configuration hint instead of an opaque NPE from Paths.
  final String nullDirMessage =
      "Metadata directory can't be" + " null. Please check configs.";
  Preconditions.checkNotNull(metadataDir, nullDirMessage);
  return Paths.get(this.metadataDir, component);
}

/**
* Gets the Key Size, The default key size is 2048, since the default
* algorithm used is RSA. User can change this by setting the "hdds.key.len"
Expand Down Expand Up @@ -508,6 +548,10 @@ public String getCaRotationTimeOfDay() {
return caRotationTimeOfDay;
}

/**
 * Returns the CA rotation ack timeout that was parsed from the
 * {@code hdds.x509.ca.rotation.ack.timeout} property when this config was
 * constructed.
 *
 * @return the configured ack timeout as a {@link Duration}.
 */
public Duration getCaAckTimeout() {
  return this.caAckTimeout;
}

/**
* Return true if using test certificates with authority as localhost. This
* should be used only for unit test where certificates are generated by
Expand Down
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We had a long discussion on this one today, and on Tuesday as well, let me summarize what we have found and agreed on.

The initial problem with the workaround from my side was that it just seems inappropriate to preserve all the old key manager references, and go over all of them, I wanted to understand why it works, and what else we can do.

The final cause for the problem as we determined is within the Netty tcnative code as you have commented as well. The following happens:
During the setup of the connections there is an SSLContext to be created, that uses the TrustManager's getAcceptedIssuers() method, and within the ReferenceCountedOpenSslServerContext the results are cached to the native layer via BIO. This cached value is not updated ever after initialization, but based on our debugging, that value is used to present to the KeyManager counterpart for mTLS authentication when this SSLContext is used for communication via the bidirectional Netty channel under the Ratis implementation.
During the communication, the KeyManager's chooseEngineClientAlias(String, Principal[], SSLEngine) method is being called, and the cached data is used to present the Principals to the chooseEngineClientAlias() method.

The original KeyManager that we hold a reference to has a mechanism to select the certificate and key alias based on the provided principals. After the first rotation, the new KeyManager has nothing to offer for the original certificates that were presented to the native layer via the getAcceptedIssuers() call. Therefore, once we had rotated the certificate, the chooseEngineClientAlias() call returned null, so the mTLS authentication failed in the Ratis layer and was then retried without any chance of succeeding.

The workaround works, as at the end of the day it goes back in time to the first KeyManager, which still is able to resolve and then return the same alias that was/is used for all the earlier and the current KeyManager to store the certificates to be used after the actual rotation.

Based on these we agreed that this class should not have the workaround implemented this way, but instead we should stick to the alias we use at the creation of the KeyManager in the load methods.

The reason is that, we just put one private key and one certificate to every KeyManager instance when we create it in the loadKeyManager() method, and we can be sure that once there is query for a key and certificate, we want to use the only key and certificate that is present.

We discussed an alternative, where we use the same SubjectDN for all the rootCA certificates, but that causes operational trouble when we want to understand certificate chaining during debugging.

Furthermore I believe that as we have just one key and one certificate in every KeyManager, we can safely return the same alias from all the methods that are there to request an alias (chooseEngineClientAlias, chooseEngineServerAlias, getClientAliases, chooseClientAlias, getServerAliases, chooseServerAlias), and we can safely return the only key and only certificate from the getPrivateKey and getCertificateChain methods, but that is up for you to consider @ChenSammi I am fine with delegating the rest down to a non-custom engine provided KeyManager instance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it was a wonderful discussion. @fapifta, thanks for the further digging. I will update the comments with more detailed info explaining the workaround for the issue. My current thought is to keep the workaround in as small a scope as possible, and to try not to bring in any unnecessary side effects. So besides chooseEngineClientAlias, which needs the workaround now, I tend to keep the other KeyManager functions, which are not affected by this issue, untouched, unless we later find that they are affected too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, we can look into that later, agree to keep the scope low, just wanted to note that this might as well be a good approach.

Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import java.security.PrivateKey;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicReference;
Expand All @@ -60,12 +61,14 @@ public class ReloadingX509KeyManager extends X509ExtendedKeyManager {
*/
static final char[] EMPTY_PASSWORD = new char[0];
private final AtomicReference<X509ExtendedKeyManager> keyManagerRef;

/**
* Current private key and cert used in keyManager. Used to detect if these
* materials are changed.
*/
private PrivateKey currentPrivateKey;
private List<String> currentCertIdsList = new ArrayList<>();
private String alias;

/**
* Construct a <code>ReloadingX509KeyManager</code>.
Expand All @@ -85,15 +88,62 @@ public ReloadingX509KeyManager(String type, CertificateClient caClient)
@Override
public String chooseEngineClientAlias(String[] strings,
Principal[] principals, SSLEngine sslEngine) {
return keyManagerRef.get()
String ret = keyManagerRef.get()
.chooseEngineClientAlias(strings, principals, sslEngine);

if (ret == null) {
/*
Workaround to address that netty tc-native cannot handle the dynamic
key and certificate refresh well. What happens is during the setup of
the grpc channel, an SSLContext is created, which is
ReferenceCountedOpenSslServerContext in the native tc-native case.
This class uses the TrustManager's getAcceptedIssuers() as the trusted
CA certificate list. The list is not updated after channel is built.
With the list being used to present the Principals during the mTLS
authentication via the Netty channel under Ratis implementation,
the counterpart(client) KeyManager's
chooseEngineClientAlias(String, Principal[], SSLEngine) method is
called with this old root certificate subject principal, which is no
longer available in the new KeyManager after the refresh, so the method
returns null, which causes the mutual TLS connection establishment to
fail.

Example error message:
Engine client aliases for RSA, DH_RSA, EC, EC_RSA, EC_EC,
O=CID-f9f2b2cf-a784-49d7-8577-5d3b13bf0b46,
OU=9f52487c-f8f9-45ee-bb56-aca60b56327f,
[email protected],
org.apache.ratis.thirdparty.io.netty.handler.ssl.OpenSslEngine@5eec0d10
is null

Example success message:
Engine client aliases for RSA, DH_RSA, EC, EC_RSA, EC_EC,
O=CID-f9f2b2cf-a784-49d7-8577-5d3b13bf0b46,
OU=9f52487c-f8f9-45ee-bb56-aca60b56327f,
[email protected],
org.apache.ratis.thirdparty.io.netty.handler.ssl.OpenSslEngine@5eec0d10
is scm/sub-ca_key
*/
ret = alias;
LOG.info("Engine client aliases for {}, {}, {} is returned as {}",
strings == null ? "" : Arrays.toString(strings),
principals == null ? "" : Arrays.toString(principals),
sslEngine == null ? "" : sslEngine, ret);
}
return ret;
}

@Override
public String chooseEngineServerAlias(String s, Principal[] principals,
SSLEngine sslEngine) {
return keyManagerRef.get()
String ret = keyManagerRef.get()
.chooseEngineServerAlias(s, principals, sslEngine);
if (ret == null && LOG.isDebugEnabled()) {
LOG.debug("Engine server aliases for {}, {}, {} is null", s,
principals == null ? "" : Arrays.toString(principals),
sslEngine == null ? "" : sslEngine);
}
return ret;
}

@Override
Expand Down Expand Up @@ -138,7 +188,7 @@ public ReloadingX509KeyManager loadFrom(CertificateClient caClient) {
try {
X509ExtendedKeyManager manager = loadKeyManager(caClient);
if (manager != null) {
this.keyManagerRef.set(manager);
keyManagerRef.set(manager);
LOG.info("ReloadingX509KeyManager is reloaded");
}
} catch (Exception ex) {
Expand All @@ -155,9 +205,8 @@ private X509ExtendedKeyManager loadKeyManager(CertificateClient caClient)
if (currentPrivateKey != null && currentPrivateKey.equals(privateKey) &&
currentCertIdsList.size() > 0 &&
newCertList.size() == currentCertIdsList.size() &&
!newCertList.stream().filter(
c -> !currentCertIdsList.contains(c.getSerialNumber().toString()))
.findAny().isPresent()) {
newCertList.stream().allMatch(c ->
currentCertIdsList.contains(c.getSerialNumber().toString()))) {
// Security materials(key and certificates) keep the same.
return null;
}
Expand All @@ -166,10 +215,15 @@ private X509ExtendedKeyManager loadKeyManager(CertificateClient caClient)
KeyStore keystore = KeyStore.getInstance(type);
keystore.load(null, null);

keystore.setKeyEntry(caClient.getComponentName() + "_key",
privateKey, EMPTY_PASSWORD,
alias = caClient.getComponentName() + "_key";
keystore.setKeyEntry(alias, privateKey, EMPTY_PASSWORD,
newCertList.toArray(new X509Certificate[0]));

LOG.info("Key manager is loaded with certificate chain");
for (X509Certificate x509Certificate : newCertList) {
LOG.info(x509Certificate.toString());
}

KeyManagerFactory keyMgrFactory = KeyManagerFactory.getInstance(
KeyManagerFactory.getDefaultAlgorithm());
keyMgrFactory.init(keystore, EMPTY_PASSWORD);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,19 @@
import javax.net.ssl.TrustManager;
import javax.net.ssl.TrustManagerFactory;
import javax.net.ssl.X509TrustManager;
import javax.security.auth.x500.X500Principal;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

/**
* A {@link TrustManager} implementation that exposes a method,
Expand All @@ -55,7 +60,7 @@ public final class ReloadingX509TrustManager implements X509TrustManager {
/**
* IDs (serial numbers) of the current Root CA certs in trustManager, used to
* detect if the certificates are changed.
*/
private String currentRootCACertId = null;
private List<String> currentRootCACertIds = new ArrayList<>();

/**
* Creates a reloadable trustmanager. The trustmanager reloads itself
Expand All @@ -80,7 +85,16 @@ public void checkClientTrusted(X509Certificate[] chain, String authType)
throws CertificateException {
X509TrustManager tm = trustManagerRef.get();
if (tm != null) {
tm.checkClientTrusted(chain, authType);
try {
tm.checkClientTrusted(chain, authType);
} catch (CertificateException e) {
LOG.info("Client certificate chain {} for authType {} is not trusted",
chain == null ? "" : Arrays.stream(chain)
.map(X509Certificate::getSubjectX500Principal)
.map(X500Principal::toString)
.collect(Collectors.joining(",")), authType);
throw e;
}
} else {
throw new CertificateException("Unknown client chain certificate: " +
chain[0].toString());
Expand All @@ -92,7 +106,16 @@ public void checkServerTrusted(X509Certificate[] chain, String authType)
throws CertificateException {
X509TrustManager tm = trustManagerRef.get();
if (tm != null) {
tm.checkServerTrusted(chain, authType);
try {
  // Delegate the actual validation to the currently loaded trust manager.
  tm.checkServerTrusted(chain, authType);
} catch (CertificateException e) {
  // This is the server-side check, so the rejected chain belongs to the
  // server. Log the subject of every certificate in the chain to make the
  // failure diagnosable, then rethrow so the handshake still fails.
  LOG.info("Server certificate chain {} for authType {} is not trusted",
      chain == null ? "" : Arrays.stream(chain)
          .map(X509Certificate::getSubjectX500Principal)
          .map(X500Principal::toString)
          .collect(Collectors.joining(",")), authType);
  throw e;
}
} else {
throw new CertificateException("Unknown server chain certificate: " +
chain[0].toString());
Expand Down Expand Up @@ -127,23 +150,22 @@ public ReloadingX509TrustManager loadFrom(CertificateClient caClient) {
X509TrustManager loadTrustManager(CertificateClient caClient)
throws GeneralSecurityException, IOException {
// SCM certificate client sets root CA as CA cert instead of root CA cert
X509Certificate rootCACert = caClient.getRootCACertificate() == null ?
caClient.getCACertificate() : caClient.getRootCACertificate();
Set<X509Certificate> certList = caClient.getAllRootCaCerts();
Set<X509Certificate> rootCACerts = certList.isEmpty() ?
caClient.getAllCaCerts() : certList;

String rootCACertId = rootCACert.getSerialNumber().toString();
// Certificate keeps the same.
if (currentRootCACertId != null &&
currentRootCACertId.equals(rootCACertId)) {
if (rootCACerts.size() > 0 &&
currentRootCACertIds.size() == rootCACerts.size() &&
rootCACerts.stream().allMatch(c ->
currentRootCACertIds.contains(c.getSerialNumber().toString()))) {
return null;
}

X509TrustManager trustManager = null;
KeyStore ks = KeyStore.getInstance(type);
ks.load(null, null);
Set<X509Certificate> caCertsToInsert =
caClient.getRootCACertificate() == null ? caClient.getAllCaCerts() :
caClient.getAllRootCaCerts();
insertCertsToKeystore(caCertsToInsert, ks);
insertCertsToKeystore(rootCACerts, ks);

TrustManagerFactory trustManagerFactory = TrustManagerFactory.getInstance(
TrustManagerFactory.getDefaultAlgorithm());
Expand All @@ -155,15 +177,19 @@ X509TrustManager loadTrustManager(CertificateClient caClient)
break;
}
}
currentRootCACertId = rootCACertId;
currentRootCACertIds.clear();
rootCACerts.forEach(
c -> currentRootCACertIds.add(c.getSerialNumber().toString()));
return trustManager;
}

/**
 * Stores each of the given certificates in the key store and logs it.
 *
 * @param certs certificates to add to the key store.
 * @param ks key store that receives the certificate entries.
 * @throws KeyStoreException if the key store rejects an entry.
 */
private void insertCertsToKeystore(Iterable<X509Certificate> certs,
    KeyStore ks) throws KeyStoreException {
  LOG.info("Trust manager is loaded with certificates");
  for (X509Certificate trustedCert : certs) {
    // The certificate's serial number doubles as its key store alias.
    final String entryAlias = trustedCert.getSerialNumber().toString();
    ks.setCertificateEntry(entryAlias, trustedCert);
    LOG.info(trustedCert.toString());
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,8 @@ public synchronized void writeCertificate(Path basePath, String fileName,
try (FileOutputStream file = new FileOutputStream(certificateFile)) {
file.write(pemEncodedCertificate.getBytes(DEFAULT_CHARSET));
}

LOG.info("Save certificate to {}", certificateFile.getAbsolutePath());
LOG.info("Certificate {}", pemEncodedCertificate);
Files.setPosixFilePermissions(certificateFile.toPath(), permissionSet);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,11 @@ private OzoneConsts() {
public static final String SCM_ROOT_CA_COMPONENT_NAME =
Paths.get(SCM_CA_CERT_STORAGE_DIR, SCM_CA_PATH).toString();

public static final String SCM_SUB_CA_PREFIX = "scm-sub@";
public static final String SCM_ROOT_CA_PREFIX = "scm@";
// %s to distinguish different certificates
public static final String SCM_SUB_CA = "scm-sub";
public static final String SCM_SUB_CA_PREFIX = SCM_SUB_CA + "-%s@";
public static final String SCM_ROOT_CA = "scm";
public static final String SCM_ROOT_CA_PREFIX = SCM_ROOT_CA + "-%s@";

// Layout Version written into Meta Table ONLY during finalization.
public static final String LAYOUT_VERSION_KEY = "#LAYOUTVERSION";
Expand Down
Loading