diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java new file mode 100644 index 000000000000..5ce4941fa6aa --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/BackgroundIOAnalyzer.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.hadoop.util.Shell;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Daemon thread that periodically samples the aggregate {@code cpu} line of
+ * {@code /proc/stat} and publishes the IOWait and System CPU percentages
+ * through {@link DataNodeIOMetrics}.
+ *
+ * <p>The constructor only configures the thread (name, daemon flag, metrics
+ * source); sampling begins when the owner calls {@link #start()} and ends
+ * after {@link #shutdown()} or an interrupt.
+ */
+public class BackgroundIOAnalyzer extends Thread {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(BackgroundIOAnalyzer.class);
+
+  private static final String NAME = "BackgroundIOAnalyzer";
+  private static final String PROCFS_STAT = "/proc/stat";
+  private static final String PROCFS_CPU = "cpu";
+  /** Index of the last counter parsed from the cpu line (iowait). */
+  private static final int IOWAIT_INDEX = 5;
+
+  private final DataNodeIOMetrics metrics;
+  private final AtomicBoolean stopping;
+  private final long remainingSleep;
+
+  // Counters of the previous sample, used to turn the cumulative values of
+  // /proc/stat into per-interval deltas. Zero until the first sample.
+  private long lastTotal;
+  private long lastIowait;
+  private long lastSystem;
+
+  /**
+   * Creates the analyzer and registers its metrics source.
+   *
+   * @param conf supplies the sleep time between two samples, in milliseconds
+   */
+  public BackgroundIOAnalyzer(IOAnalyzerConfiguration conf) {
+    this.metrics = DataNodeIOMetrics.create();
+    this.stopping = new AtomicBoolean(false);
+    this.remainingSleep = conf.getIOAnalyzerInterval();
+    setName(NAME);
+    setDaemon(true);
+  }
+
+  /**
+   * Samples until {@link #shutdown()} is called or the thread is interrupted.
+   * On a non-Linux platform it logs one warning and returns immediately.
+   * The metrics source is unregistered on every exit path.
+   */
+  @Override
+  public void run() {
+    try {
+      if (!Shell.LINUX) {
+        // Leave right away: looping here without a sleep would spin on the
+        // CPU and flood the log with the same warning.
+        LOG.warn("Analyzing IO: We currently only support Linux systems.");
+        return;
+      }
+      while (!stopping.get()) {
+        analyzerIoWaitAndSystem();
+      }
+      LOG.info("{} exiting.", this);
+    } catch (Exception e) {
+      LOG.error("{} exiting because of exception ", this, e);
+    } finally {
+      if (metrics != null) {
+        metrics.unregister();
+      }
+    }
+  }
+
+  /**
+   * Analyzes the usage of IOWait and System metrics.
+   *
+   * <p><b>Data Collection Approach</b>
+   * <p>This method employs a lightweight strategy to gather relevant data by:
+   * <ul>
+   *   <li>reading only the first {@code cpu} line of {@code /proc/stat};</li>
+   *   <li>parsing the user, nice, system, idle and iowait counters;</li>
+   *   <li>reporting iowait and system as a percentage of the CPU time that
+   *   elapsed since the previous sample.</li>
+   * </ul>
+   *
+   * <p><b>Metrics Breakdown</b>
+   * <p>Column order of the {@code cpu} line, after the {@code cpu} label:
+   * <pre>
+   * user nice system idle iowait irq softirq steal guest guest_nice
+   * </pre>
+   * The counters are cumulative since boot, so the very first sample (which
+   * has no predecessor) reports the average since boot.
+   *
+   * <p>A malformed or unreadable sample is logged and skipped. The method
+   * always ends by sleeping for the configured interval.
+   */
+  private void analyzerIoWaitAndSystem() {
+    try (BufferedReader reader =
+        new BufferedReader(new FileReader(PROCFS_STAT))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (line.startsWith(PROCFS_CPU)) {
+          // Only the first matching line is used; the rest is not read.
+          updateGauges(line.split("\\s+"));
+          break;
+        }
+      }
+    } catch (IOException | NumberFormatException e) {
+      // A bad sample must not terminate the thread; try again next interval.
+      LOG.error("An error occurred during the Analyzing IO process.", e);
+    }
+
+    // We collect IO performance data at regular intervals,
+    // which is usually every 30 seconds.
+    handleRemainingSleep(remainingSleep);
+  }
+
+  /**
+   * Converts one split cpu line into interval percentages and publishes them.
+   *
+   * @param values the whitespace-split cpu line, label at index 0
+   * @throws NumberFormatException if one of the first five counters is not a
+   *     valid long
+   */
+  private void updateGauges(String[] values) {
+    if (ArrayUtils.getLength(values) <= IOWAIT_INDEX) {
+      LOG.warn("Ignoring malformed cpu line in {}: expected more than {} fields.",
+          PROCFS_STAT, IOWAIT_INDEX);
+      return;
+    }
+
+    // Step1. Retrieve all CPU system time data.
+    long user = Long.parseLong(values[1]);
+    long nice = Long.parseLong(values[2]);
+    long system = Long.parseLong(values[3]);
+    long idle = Long.parseLong(values[4]);
+    long iowait = Long.parseLong(values[IOWAIT_INDEX]);
+
+    // Step2. Calculate total CPU time.
+    long totalCpuTime = user + nice + system + idle + iowait;
+
+    // Step3. Work on the difference to the previous sample, because the
+    // counters themselves only ever describe the time since boot.
+    long totalDelta = totalCpuTime - lastTotal;
+    long iowaitDelta = iowait - lastIowait;
+    long systemDelta = system - lastSystem;
+    lastTotal = totalCpuTime;
+    lastIowait = iowait;
+    lastSystem = system;
+    if (totalDelta <= 0) {
+      // No CPU time elapsed between the samples: keep the previous gauges.
+      return;
+    }
+
+    // Step4. Calculate the ratio.
+    long iowaitRatio = toPercent(iowaitDelta, totalDelta);
+    metrics.setDNIoWait(iowaitRatio);
+
+    long systemRatio = toPercent(systemDelta, totalDelta);
+    metrics.setDNSystem(systemRatio);
+
+    LOG.debug("IO Analyzer : IoWait = {}, System = {}.", iowaitRatio, systemRatio);
+  }
+
+  /**
+   * Returns {@code part / total} as a whole percentage, rounded down and
+   * limited to the range 0..100 (a single counter may move backwards).
+   */
+  private static long toPercent(long part, long total) {
+    long percent = (long) Math.floor((double) part / total * 100);
+    return Math.max(0L, Math.min(100L, percent));
+  }
+
+  /**
+   * Sleeps for the given time. An interrupt during the sleep marks the
+   * analyzer as stopping and restores the interrupt flag.
+   *
+   * @param remainingSleep sleep time in milliseconds; values that are not
+   *     positive return immediately
+   */
+  public final void handleRemainingSleep(long remainingSleep) {
+    if (remainingSleep > 0) {
+      try {
+        Thread.sleep(remainingSleep);
+      } catch (InterruptedException e) {
+        stopping.set(true);
+        LOG.warn("Background IOAnalyzer was interrupted.");
+        Thread.currentThread().interrupt();
+      }
+    }
+  }
+
+  /**
+   * Stops the analyzer. The first caller that flips the stopping flag
+   * interrupts the thread and waits for it to finish; every call unregisters
+   * the metrics source.
+   */
+  public synchronized void shutdown() {
+    if (stopping.compareAndSet(false, true)) {
+      this.interrupt();
+      try {
+        this.join();
+      } catch (InterruptedException ex) {
+        LOG.warn("Unexpected exception while stopping io analyzer.", ex);
+        Thread.currentThread().interrupt();
+      }
+    }
+
+    if (metrics != null) {
+      metrics.unregister();
+    }
+  }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java
new file mode 100644
index 000000000000..ece4b0eefd52
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/DataNodeIOMetrics.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
+import org.apache.hadoop.hdds.annotation.InterfaceAudience.Private;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+
+/**
+ * Metrics source with two long gauges, one for IOWait and one for System.
+ * Instances are normally obtained through {@link #create()}, which also
+ * registers them with the default metrics system.
+ */
+@Private
+@Metrics(about = "Datanode io metrics", context = "io")
+public class DataNodeIOMetrics {
+  // Name under which this source is registered and later unregistered.
+  private final String name;
+  // Metrics system used by unregister().
+  private final MetricsSystem ms;
+
+  // NOTE(review): the gauge field names presumably determine the published
+  // metric names - confirm before renaming either of them.
+  @Metric
+  private MutableGaugeLong ioWaitGauge;
+
+  @Metric
+  private MutableGaugeLong systemGauge;
+
+  /**
+   * Stores the source name and the metrics system; does not register anything.
+   */
+  public DataNodeIOMetrics(String name, MetricsSystem ms) {
+    this.name = name;
+    this.ms = ms;
+  }
+
+  /**
+   * Builds a source named "DataNodeIOMetrics" and registers it with the
+   * default metrics system.
+   *
+   * @return the value returned by the registration call
+   */
+  public static DataNodeIOMetrics create() {
+    final MetricsSystem metricsSystem = DefaultMetricsSystem.instance();
+    final String sourceName = "DataNodeIOMetrics";
+    final DataNodeIOMetrics source =
+        new DataNodeIOMetrics(sourceName, metricsSystem);
+    return metricsSystem.register(sourceName, null, source);
+  }
+
+  /** Removes this source from the metrics system it was created with. */
+  public void unregister() {
+    this.ms.unregisterSource(this.name);
+  }
+
+  /** Sets the IOWait gauge to the given value. */
+  public void setDNIoWait(long ioWait) {
+    this.ioWaitGauge.set(ioWait);
+  }
+
+  /** Sets the System gauge to the given value. */
+  public void setDNSystem(long system) {
+    this.systemGauge.set(system);
+  }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java
new file mode 100644
index 000000000000..36ffb817f594
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/io/IOAnalyzerConfiguration.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.io;
+
+import java.time.Duration;
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigTag;
+import org.apache.hadoop.hdds.conf.ConfigType;
+
+/**
+ * Settings of the datanode IO analyzer, declared under the
+ * {@code hdds.datanode.io} configuration prefix.
+ */
+@ConfigGroup(prefix = "hdds.datanode.io")
+public class IOAnalyzerConfiguration {
+
+  /**
+   * Interval, in milliseconds, used when the object is created directly.
+   * Kept equal to the annotation default ("30s") of the interval property.
+   */
+  public static final long IO_ANALYZER_INTERVAL_DEFAULT =
+      Duration.ofSeconds(30).toMillis();
+
+  @Config(key = "enabled",
+      type = ConfigType.BOOLEAN,
+      defaultValue = "true",
+      tags = {ConfigTag.DATANODE},
+      description = "Config parameter to enable datanode io analyzer.")
+  private boolean enabled = true;
+
+  // NOTE(review): combined with the group prefix this key presumably becomes
+  // "hdds.datanode.io.io.analyzer.interval" - confirm the repeated "io" is
+  // intended before this key is published.
+  @Config(key = "io.analyzer.interval",
+      type = ConfigType.TIME,
+      defaultValue = "30s",
+      tags = {ConfigTag.STORAGE},
+      description = "The time interval for acquiring IO data is set to 30s.")
+  private long iOAnalyzerInterval = IO_ANALYZER_INTERVAL_DEFAULT;
+
+  /**
+   * @return whether the IO analyzer is switched on
+   */
+  public boolean isEnabled() {
+    return enabled;
+  }
+
+  /**
+   * @return the stored time between two IO samples; presumably milliseconds
+   *     when injected from configuration as well - TODO confirm the TIME unit
+   */
+  public long getIOAnalyzerInterval() {
+    return iOAnalyzerInterval;
+  }
+}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 8030de196a1f..bbe034108eb7 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java @@ -86,6 +86,8 @@ import org.apache.hadoop.ozone.container.common.volume.StorageVolume; import org.apache.hadoop.ozone.container.common.volume.StorageVolume.VolumeType; import org.apache.hadoop.ozone.container.common.volume.StorageVolumeChecker; +import org.apache.hadoop.ozone.container.io.BackgroundIOAnalyzer; +import org.apache.hadoop.ozone.container.io.IOAnalyzerConfiguration; import org.apache.hadoop.ozone.container.keyvalue.statemachine.background.StaleRecoveringContainerScrubbingService; import org.apache.hadoop.ozone.container.metadata.WitnessedContainerMetadataStore; import org.apache.hadoop.ozone.container.metadata.WitnessedContainerMetadataStoreImpl; @@ -131,6 +133,7 @@ public class OzoneContainer { private DatanodeDetails datanodeDetails; private StateContext context; private ScheduledExecutorService dbCompactionExecutorService; + private BackgroundIOAnalyzer ioAnalyzer; private final ContainerMetrics metrics; private WitnessedContainerMetadataStore witnessedContainerMetadataStore; @@ -289,6 +292,10 @@ public OzoneContainer(HddsDatanodeService hddsDatanodeService, initializingStatus = new AtomicReference<>(InitializingStatus.UNINITIALIZED); + + IOAnalyzerConfiguration c = config.getObject( + IOAnalyzerConfiguration.class); + ioAnalyzer = new BackgroundIOAnalyzer(c); } /** @@ -541,6 +548,8 @@ public void stop() { recoveringContainerScrubbingService.shutdown(); IOUtils.closeQuietly(metrics); ContainerMetrics.remove(); + ioAnalyzer.shutdown(); + if (this.witnessedContainerMetadataStore != null) { try { this.witnessedContainerMetadataStore.stop();