diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java
new file mode 100644
index 000000000000..5ce4941fa6aa
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.util.Shell;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
+public class BackgroundIOAnalyzer extends Thread {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(BackgroundIOAnalyzer.class);
+
+ private static final String NAME = "BackgroundIOAnalyzer";
+ private static final String PROCFS_STAT = "/proc/stat";
+ private static final String PROCFS_CPU = "cpu";
+ private DataNodeIOMetrics metrics;
+ private final AtomicBoolean stopping;
+ private final long remainingSleep;
+
+ public BackgroundIOAnalyzer(IOAnalyzerConfiguration conf) {
+ this.metrics = DataNodeIOMetrics.create();
+ this.stopping = new AtomicBoolean(false);
+ this.remainingSleep = conf.getIOAnalyzerInterval();
+ setName(NAME);
+ setDaemon(true);
+ }
+
+ @Override
+ public void run() {
+ try {
+ while (!stopping.get()) {
+ analyzerIoWaitAndSystem();
+ }
+ LOG.info("{} exiting.", this);
+ } catch (Exception e) {
+ LOG.error("{} exiting because of exception ", this, e);
+ } finally {
+ if (metrics != null) {
+ metrics.unregister();
+ }
+ }
+ }
+
+ /**
+ * Analyzes the usage of IOWait and System metrics.
+ *
+ *
Drive Types
+ *
+ * - HDDs and SSDs: May experience spikes in IOWait during read/write operations.
+ * - NVMe drives: Can result in high System values during intensive read/write activities.
+ *
+ *
+ * Monitoring Purpose
+ * Monitoring IOWait and System metrics is crucial for:
+ *
+ * - Identifying performance bottlenecks.
+ * - Guiding future management decisions.
+ *
+ *
+ * Data Collection Approach
+ * This method employs a lightweight strategy to gather relevant data by:
+ *
+ * - Reading the
/proc/stat file to obtain current CPU usage.
+ * - Focusing solely on the first line of this file for metrics.
+ *
+ *
+ * Metrics Breakdown
+ *
+ *
+ * | user |
+ * nice |
+ * system |
+ * idle |
+ * iowait |
+ * irq |
+ * softirq |
+ * steal |
+ * guest |
+ * guest_nice |
+ *
+ *
+ * | 10672634648 |
+ * 17921665 |
+ * 1413355479 |
+ * 397261436142 |
+ * 17949886 |
+ * 0 |
+ * 55070272 |
+ * 0 |
+ * 0 |
+ * 0 |
+ *
+ *
+ *
+ */
+ private void analyzerIoWaitAndSystem() {
+
+ if (!Shell.LINUX) {
+ LOG.warn("Analyzing IO: We currently only support Linux systems.");
+ return;
+ }
+
+ try (BufferedReader reader = new BufferedReader(new FileReader(PROCFS_STAT))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ if (line.startsWith(PROCFS_CPU)) {
+ String[] values = line.split("\\s+");
+ if(ArrayUtils.isNotEmpty(values)) {
+
+ // Step1. Retrieve all CPU system time data.
+ long user = Long.parseLong(values[1]);
+ long nice = Long.parseLong(values[2]);
+ long system = Long.parseLong(values[3]);
+ long idle = Long.parseLong(values[4]);
+ long iowait = Long.parseLong(values[5]);
+
+ // Step2. Calculate total CPU time.
+ long totalCpuTime = user + nice + system + idle + iowait;
+
+ // Step3. Calculate the ratio.
+ long iowaitRatio = (long) Math.floor((double) iowait / totalCpuTime * 100);
+ metrics.setDNIoWait(iowaitRatio);
+
+ long systemRatio = (long) Math.floor((double) system / totalCpuTime * 100);
+ metrics.setDNSystem(systemRatio);
+
+ LOG.debug("IO Analyzer : IoWait = {}, System = {}.", iowaitRatio, systemRatio);
+ }
+ break;
+ }
+ }
+ } catch (IOException e) {
+ LOG.error("An error occurred during the Analyzing IO process.", e);
+ }
+
+ // We collect IO performance data at regular intervals,
+ // which is usually every 30 seconds.
+ handleRemainingSleep(remainingSleep);
+ }
+
+ public final void handleRemainingSleep(long remainingSleep) {
+ if (remainingSleep > 0) {
+ try {
+ Thread.sleep(remainingSleep);
+ } catch (InterruptedException ignored) {
+ stopping.set(true);
+ LOG.warn("Background IOAnalyzer was interrupted.");
+ Thread.currentThread().interrupt();
+ }
+ }
+ }
+
+ public synchronized void shutdown() {
+ if (stopping.compareAndSet(false, true)) {
+ this.interrupt();
+ try {
+ this.join();
+ } catch (InterruptedException ex) {
+ LOG.warn("Unexpected exception while stopping io analyzer.", ex);
+ Thread.currentThread().interrupt();
+ }
+ }
+
+ if (metrics != null) {
+ metrics.unregister();
+ }
+ }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java
new file mode 100644
index 000000000000..ece4b0eefd52
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
+import org.apache.hadoop.hdds.annotation.InterfaceAudience.Private;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+
+@Private
+@Metrics(about = "Datanode io metrics", context = "io")
+public class DataNodeIOMetrics {
+ private final String name;
+ private final MetricsSystem ms;
+
+ @Metric
+ private MutableGaugeLong ioWaitGauge;
+
+ @Metric
+ private MutableGaugeLong systemGauge;
+
+ public DataNodeIOMetrics(String name, MetricsSystem ms) {
+ this.name = name;
+ this.ms = ms;
+ }
+
+ public void setDNIoWait(long ioWait) {
+ ioWaitGauge.set(ioWait);
+ }
+
+ public void setDNSystem(long system) {
+ systemGauge.set(system);
+ }
+
+ public static DataNodeIOMetrics create() {
+ MetricsSystem ms = DefaultMetricsSystem.instance();
+ String name = "DataNodeIOMetrics";
+ return ms.register(name, null, new DataNodeIOMetrics(name, ms));
+ }
+
+ public void unregister() {
+ ms.unregisterSource(name);
+ }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java
new file mode 100644
index 000000000000..36ffb817f594
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
+import java.time.Duration;
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigTag;
+import org.apache.hadoop.hdds.conf.ConfigType;
+
+@ConfigGroup(prefix = "hdds.datanode.io")
+public class IOAnalyzerConfiguration {
+
+ public static final long IO_ANALYZER_INTERVAL_DEFAULT =
+ Duration.ofDays(7).toMillis();
+
+ @Config(key = "enabled",
+ type = ConfigType.BOOLEAN,
+ defaultValue = "true",
+ tags = {ConfigTag.DATANODE},
+ description = "Config parameter to enable datanode io analyzer.")
+ private boolean enabled = true;
+
+ @Config(key = "io.analyzer.interval",
+ type = ConfigType.TIME,
+ defaultValue = "30s",
+ tags = {ConfigTag.STORAGE},
+ description = "The time interval for acquiring IO data is set to 30s.")
+ private long iOAnalyzerInterval = IO_ANALYZER_INTERVAL_DEFAULT;
+
+ public long getIOAnalyzerInterval() {
+ return iOAnalyzerInterval;
+ }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 8030de196a1f..bbe034108eb7 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -86,6 +86,8 @@
import org.apache.hadoop.ozone.container.common.volume.StorageVolume;
import org.apache.hadoop.ozone.container.common.volume.StorageVolume.VolumeType;
import org.apache.hadoop.ozone.container.common.volume.StorageVolumeChecker;
+import org.apache.hadoop.ozone.container.io.BackgroundIOAnalyzer;
+import org.apache.hadoop.ozone.container.io.IOAnalyzerConfiguration;
import org.apache.hadoop.ozone.container.keyvalue.statemachine.background.StaleRecoveringContainerScrubbingService;
import org.apache.hadoop.ozone.container.metadata.WitnessedContainerMetadataStore;
import org.apache.hadoop.ozone.container.metadata.WitnessedContainerMetadataStoreImpl;
@@ -131,6 +133,7 @@ public class OzoneContainer {
private DatanodeDetails datanodeDetails;
private StateContext context;
private ScheduledExecutorService dbCompactionExecutorService;
+ private BackgroundIOAnalyzer ioAnalyzer;
private final ContainerMetrics metrics;
private WitnessedContainerMetadataStore witnessedContainerMetadataStore;
@@ -289,6 +292,10 @@ public OzoneContainer(HddsDatanodeService hddsDatanodeService,
initializingStatus =
new AtomicReference<>(InitializingStatus.UNINITIALIZED);
+
+ IOAnalyzerConfiguration c = config.getObject(
+ IOAnalyzerConfiguration.class);
+ ioAnalyzer = new BackgroundIOAnalyzer(c);
}
/**
@@ -541,6 +548,8 @@ public void stop() {
recoveringContainerScrubbingService.shutdown();
IOUtils.closeQuietly(metrics);
ContainerMetrics.remove();
+ ioAnalyzer.shutdown();
+
if (this.witnessedContainerMetadataStore != null) {
try {
this.witnessedContainerMetadataStore.stop();