diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 658a0d67d5ab..3dad6045698e 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -3504,6 +3504,23 @@
If the buffer overflows, task reinitialization will be triggered.
+
+ ozone.recon.dn.metrics.collection.thread.count
+ 10
+ OZONE, RECON, DN
+
      The number of threads used to collect metrics from the JMX endpoint.
+
+
+
+ ozone.recon.dn.metrics.collection.minimum.api.delay
+ 30s
+ OZONE, RECON, DN
+
      Minimum delay enforced by the API before starting a new JMX metrics collection task.
      This acts as a rate limiter to avoid creating unnecessary collection tasks.
+
+
ozone.scm.datanode.admin.monitor.interval
30s
diff --git a/hadoop-ozone/dist/src/main/compose/ozone/docker-config b/hadoop-ozone/dist/src/main/compose/ozone/docker-config
index f2a9e0447932..81e05a7f2791 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone/docker-config
@@ -56,7 +56,7 @@ OZONE-SITE.XML_ozone.http.basedir=/tmp/ozone_http
OZONE-SITE.XML_hdds.container.ratis.datastream.enabled=true
OZONE-SITE.XML_ozone.fs.hsync.enabled=true
-
+OZONE-SITE.XML_ozone.recon.dn.metrics.collection.minimum.api.delay=5s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/integration-test-recon/src/test/java/org/apache/hadoop/ozone/recon/TestStorageDistributionEndpoint.java b/hadoop-ozone/integration-test-recon/src/test/java/org/apache/hadoop/ozone/recon/TestStorageDistributionEndpoint.java
index c43958fe1549..e8a5dab62eef 100644
--- a/hadoop-ozone/integration-test-recon/src/test/java/org/apache/hadoop/ozone/recon/TestStorageDistributionEndpoint.java
+++ b/hadoop-ozone/integration-test-recon/src/test/java/org/apache/hadoop/ozone/recon/TestStorageDistributionEndpoint.java
@@ -22,19 +22,22 @@
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT;
import static org.apache.hadoop.hdds.client.ReplicationFactor.THREE;
import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HA_DBTRANSACTIONBUFFER_FLUSH_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HA_RATIS_SNAPSHOT_GAP;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_INTERVAL;
-import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_DIR_DELETING_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.recon.TestReconEndpointUtil.getReconWebAddress;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.time.Duration;
import java.util.Collections;
import java.util.List;
+import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FSDataOutputStream;
@@ -46,6 +49,8 @@
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.ScmConfig;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.hadoop.ozone.HddsDatanodeService;
@@ -66,6 +71,9 @@
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup;
+import org.apache.hadoop.ozone.recon.api.DataNodeMetricsService;
+import org.apache.hadoop.ozone.recon.api.types.DataNodeMetricsServiceResponse;
+import org.apache.hadoop.ozone.recon.api.types.ScmPendingDeletion;
import org.apache.hadoop.ozone.recon.api.types.StorageCapacityDistributionResponse;
import org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl;
import org.apache.ozone.test.GenericTestUtils;
@@ -100,6 +108,7 @@ public class TestStorageDistributionEndpoint {
private static final ObjectMapper MAPPER = new ObjectMapper();
private static final String STORAGE_DIST_ENDPOINT = "/api/v1/storageDistribution";
+ private static final String PENDING_DELETION_ENDPOINT = "/api/v1/pendingDeletion";
static List replicationConfigs() {
return Collections.singletonList(
@@ -110,17 +119,14 @@ static List replicationConfigs() {
@BeforeAll
public static void setup() throws Exception {
conf = new OzoneConfiguration();
- conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_INTERVAL, 100,
- TimeUnit.MILLISECONDS);
- conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_TIMEOUT, 100,
- TimeUnit.MILLISECONDS);
- conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
- 100, TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(OZONE_DIR_DELETING_SERVICE_INTERVAL, 100, TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_INTERVAL, 100, TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, 100, TimeUnit.MILLISECONDS);
conf.setLong(OZONE_SCM_HA_RATIS_SNAPSHOT_GAP, 1L);
- conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 50,
- TimeUnit.MILLISECONDS);
- conf.setTimeDuration(HDDS_CONTAINER_REPORT_INTERVAL, 200,
- TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 50, TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(HDDS_CONTAINER_REPORT_INTERVAL, 200, TimeUnit.MILLISECONDS);
+ conf.setTimeDuration(OZONE_SCM_HA_DBTRANSACTIONBUFFER_FLUSH_INTERVAL, 500, TimeUnit.MILLISECONDS);
+ conf.set(ReconServerConfigKeys.OZONE_RECON_DN_METRICS_COLLECTION_MINIMUM_API_DELAY, "1s");
// Enhanced SCM configuration for faster block deletion processing
ScmConfig scmConfig = conf.getObject(ScmConfig.class);
@@ -129,18 +135,9 @@ public static void setup() throws Exception {
conf.set(HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT, "0s");
// Enhanced DataNode configuration to move pending deletion from SCM to DN faster
- DatanodeConfiguration dnConf =
- conf.getObject(DatanodeConfiguration.class);
- dnConf.setBlockDeletionInterval(Duration.ofMillis(100));
- // Increase block delete queue limit to allow more queued commands on DN
- dnConf.setBlockDeleteQueueLimit(50);
- // Reduce the interval for delete command worker processing
- dnConf.setBlockDeleteCommandWorkerInterval(Duration.ofMillis(100));
- // Increase blocks deleted per interval to speed up deletion
- dnConf.setBlockDeletionLimit(5000);
+ DatanodeConfiguration dnConf = conf.getObject(DatanodeConfiguration.class);
+ dnConf.setBlockDeletionInterval(Duration.ofMillis(30000));
conf.setFromObject(dnConf);
- // Increase DN delete threads for faster parallel processing
- conf.setInt("ozone.datanode.block.delete.threads.max", 10);
recon = new ReconService(conf);
cluster = MiniOzoneCluster.newBuilder(conf)
@@ -190,19 +187,110 @@ public void testStorageDistributionEndpoint(ReplicationConfig replicationConfig)
}
}
waitForKeysCreated(replicationConfig);
- Thread.sleep(10000);
- StringBuilder urlBuilder = new StringBuilder();
- urlBuilder.append(getReconWebAddress(conf))
- .append(STORAGE_DIST_ENDPOINT);
- String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
- StorageCapacityDistributionResponse storageResponse =
- MAPPER.readValue(response, StorageCapacityDistributionResponse.class);
+ GenericTestUtils.waitFor(() -> {
+ try {
+ StringBuilder urlBuilder = new StringBuilder();
+ urlBuilder.append(getReconWebAddress(conf))
+ .append(STORAGE_DIST_ENDPOINT);
+ String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
+ StorageCapacityDistributionResponse storageResponse =
+ MAPPER.readValue(response, StorageCapacityDistributionResponse.class);
- assertEquals(20, storageResponse.getGlobalNamespace().getTotalKeys());
- assertEquals(60, storageResponse.getGlobalNamespace().getTotalUsedSpace());
- assertEquals(0, storageResponse.getUsedSpaceBreakDown().getOpenKeyBytes());
- assertEquals(60, storageResponse.getUsedSpaceBreakDown().getCommittedKeyBytes());
- assertEquals(3, storageResponse.getDataNodeUsage().size());
+ assertEquals(20, storageResponse.getGlobalNamespace().getTotalKeys());
+ assertEquals(60, storageResponse.getGlobalNamespace().getTotalUsedSpace());
+ assertEquals(0, storageResponse.getUsedSpaceBreakDown().getOpenKeyBytes());
+ assertEquals(60, storageResponse.getUsedSpaceBreakDown().getCommittedKeyBytes());
+ assertEquals(3, storageResponse.getDataNodeUsage().size());
+
+ return true;
+ } catch (Exception e) {
+ LOG.debug("Waiting for storage distribution assertions to pass", e);
+ return false;
+ }
+ }, 1000, 30000);
+ closeAllContainers();
+ fs.delete(dir1, true);
+ GenericTestUtils.waitFor(() -> {
+ try {
+ syncDataFromOM();
+ StringBuilder urlBuilder = new StringBuilder();
+ urlBuilder.append(getReconWebAddress(conf))
+ .append(PENDING_DELETION_ENDPOINT + "?component=om");
+ String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
+ Map pendingDeletionMap =
+ MAPPER.readValue(response, Map.class);
+ assertEquals(30, pendingDeletionMap.get("totalSize"));
+ assertEquals(30, pendingDeletionMap.get("pendingDirectorySize") + pendingDeletionMap.get("pendingKeySize"));
+ return true;
+ } catch (Throwable e) {
+ LOG.debug("Waiting for storage distribution assertions to pass", e);
+ return false;
+ }
+ }, 1000, 30000);
+
+ GenericTestUtils.waitFor(() -> {
+ try {
+ StringBuilder urlBuilder = new StringBuilder();
+ urlBuilder.append(getReconWebAddress(conf))
+ .append(PENDING_DELETION_ENDPOINT + "?component=scm");
+ String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
+ ScmPendingDeletion pendingDeletion =
+ MAPPER.readValue(response, ScmPendingDeletion.class);
+ assertEquals(30, pendingDeletion.getTotalReplicatedBlockSize());
+ assertEquals(10, pendingDeletion.getTotalBlocksize());
+ assertEquals(10, pendingDeletion.getTotalBlocksCount());
+ return true;
+ } catch (Throwable e) {
+ LOG.debug("Waiting for storage distribution assertions to pass", e);
+ return false;
+ }
+ }, 2000, 30000);
+ GenericTestUtils.waitFor(() ->
+ scm.getClientProtocolServer().getDeletedBlockSummary().getTotalBlockCount() == 0,
+ 1000, 30000);
+ GenericTestUtils.waitFor(() -> {
+ try {
+ scm.getScmHAManager().asSCMHADBTransactionBuffer().flush();
+ StringBuilder urlBuilder = new StringBuilder();
+ urlBuilder.append(getReconWebAddress(conf))
+ .append(PENDING_DELETION_ENDPOINT + "?component=dn");
+ String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
+ DataNodeMetricsServiceResponse pendingDeletion =
+ MAPPER.readValue(response, DataNodeMetricsServiceResponse.class);
+ assertNotNull(pendingDeletion);
+ assertEquals(30, pendingDeletion.getTotalPendingDeletion());
+ assertEquals(DataNodeMetricsService.MetricCollectionStatus.SUCCEEDED, pendingDeletion.getStatus());
+ assertEquals(pendingDeletion.getTotalNodesQueried(), pendingDeletion.getPendingDeletionPerDataNode().size());
+ assertEquals(0, pendingDeletion.getTotalNodeQueryFailures());
+ pendingDeletion.getPendingDeletionPerDataNode().forEach(dn -> {
+ assertEquals(10, dn.getPendingBlockSize());
+ });
+ return true;
+ } catch (Throwable e) {
+ LOG.debug("Waiting for storage distribution assertions to pass", e);
+ return false;
+ }
+ }, 2000, 60000);
+ cluster.getHddsDatanodes().get(0).stop();
+
+ GenericTestUtils.waitFor(() -> {
+ try {
+ StringBuilder urlBuilder = new StringBuilder();
+ urlBuilder.append(getReconWebAddress(conf))
+ .append(PENDING_DELETION_ENDPOINT + "?component=dn");
+ String response = TestReconEndpointUtil.makeHttpCall(conf, urlBuilder);
+ DataNodeMetricsServiceResponse pendingDeletion =
+ MAPPER.readValue(response, DataNodeMetricsServiceResponse.class);
+ assertNotNull(pendingDeletion);
+ assertEquals(1, pendingDeletion.getTotalNodeQueryFailures());
+ assertTrue(pendingDeletion.getPendingDeletionPerDataNode()
+ .stream()
+ .anyMatch(dn -> dn.getPendingBlockSize() == -1));
+ return true;
+ } catch (Throwable e) {
+ return false;
+ }
+ }, 2000, 60000);
}
private void verifyBlocksCreated(
@@ -266,6 +354,14 @@ private static void performOperationOnKeyContainers(
}
}
+ private static void closeAllContainers() {
+ for (ContainerInfo container :
+ scm.getContainerManager().getContainers()) {
+ scm.getEventQueue().fireEvent(SCMEvents.CLOSE_CONTAINER,
+ container.containerID());
+ }
+ }
+
@AfterEach
public void cleanup() {
assertDoesNotThrow(() -> {
diff --git a/hadoop-ozone/recon/pom.xml b/hadoop-ozone/recon/pom.xml
index 9a0936ebf194..cdabc26f3fbd 100644
--- a/hadoop-ozone/recon/pom.xml
+++ b/hadoop-ozone/recon/pom.xml
@@ -126,6 +126,26 @@
org.apache.hadoop
hadoop-hdfs-client
+
+ org.apache.httpcomponents
+ httpasyncclient
+ ${httpasyncclient.version}
+
+
+ org.apache.httpcomponents
+ httpclient
+ ${httpclient.version}
+
+
+ org.apache.httpcomponents
+ httpcore
+ ${httpcore.version}
+
+
+ org.apache.httpcomponents
+ httpcore-nio
+ ${httpcore.version}
+
org.apache.ozone
hdds-common
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/MetricsServiceProviderFactory.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/MetricsServiceProviderFactory.java
index d09c01ea72f9..cd315e220d9e 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/MetricsServiceProviderFactory.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/MetricsServiceProviderFactory.java
@@ -23,6 +23,7 @@
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.recon.ReconConfigKeys;
import org.apache.hadoop.ozone.recon.spi.MetricsServiceProvider;
+import org.apache.hadoop.ozone.recon.spi.impl.JmxServiceProviderImpl;
import org.apache.hadoop.ozone.recon.spi.impl.PrometheusServiceProviderImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -67,6 +68,14 @@ public MetricsServiceProvider getMetricsServiceProvider() {
return null;
}
+ /**
+ * Returns the configured MetricsServiceProvider implementation for Jmx.
+ * @return MetricsServiceProvider instance for Jmx
+ */
+ public MetricsServiceProvider getJmxMetricsServiceProvider() {
+ return new JmxServiceProviderImpl(configuration, reconUtils);
+ }
+
/**
* Returns the Prometheus endpoint if configured. Otherwise returns null.
*
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServerConfigKeys.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServerConfigKeys.java
index d3f684238712..8fa11caa00f2 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServerConfigKeys.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServerConfigKeys.java
@@ -191,6 +191,14 @@ public final class ReconServerConfigKeys {
public static final int
OZONE_RECON_SCM_CLIENT_FAILOVER_MAX_RETRY_DEFAULT = 3;
+ public static final String OZONE_RECON_DN_METRICS_COLLECTION_THREAD_COUNT =
+ "ozone.recon.dn.metrics.collection.thread.count";
+ public static final int OZONE_RECON_DN_METRICS_COLLECTION_THREAD_COUNT_DEFAULT = 10;
+
+ public static final String OZONE_RECON_DN_METRICS_COLLECTION_MINIMUM_API_DELAY =
+ "ozone.recon.dn.metrics.collection.minimum.api.delay";
+ public static final String OZONE_RECON_DN_METRICS_COLLECTION_MINIMUM_API_DELAY_DEFAULT = "30s";
+
/**
* Private constructor for utility class.
*/
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconUtils.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconUtils.java
index ccc92648f117..293a4d77a069 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconUtils.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconUtils.java
@@ -847,4 +847,31 @@ public static String constructObjectPathWithPrefix(long... ids) {
}
return pathBuilder.toString();
}
+
+ public static Map getMetricsData(List