@@ -34,7 +34,7 @@ public interface NodeManagerMXBean {
*
* @return A map of operational state to a map of health state to node count
*/
Map<String, Integer> getNodeCount();
Map<String, Map<String, Integer>> getNodeCount();

/**
* Get the disk metrics like capacity, usage and remaining based on the
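The MXBean now nests node counts by operational state first, then health state. A minimal sketch of how a JMX consumer might walk the new structure; `NodeCountPrinter` and the way the bean handle is obtained are illustrative only, not part of this change:

```java
import java.util.Map;

// Illustrative consumer of the new nested map. How the NodeManagerMXBean
// handle is obtained (e.g. via a JMX connection) is out of scope here.
public final class NodeCountPrinter {

  private NodeCountPrinter() {
  }

  public static void print(NodeManagerMXBean nodeManager) {
    Map<String, Map<String, Integer>> counts = nodeManager.getNodeCount();
    // Outer key: operational state; inner key: health state.
    counts.forEach((opState, byHealth) ->
        byHealth.forEach((health, count) ->
            // e.g. "IN_SERVICE / HEALTHY -> 12"
            System.out.println(opState + " / " + health + " -> " + count)));
  }
}
```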
@@ -482,62 +482,97 @@ private SCMNodeStat getNodeStatInternal(DatanodeDetails datanodeDetails) {
}
}

@Override
public Map<String, Integer> getNodeCount() {
// TODO - This does not consider decom, maint etc.
Map<String, Integer> nodeCountMap = new HashMap<String, Integer>();
for(NodeState state : NodeState.values()) {
// TODO - this iterate the node list once per state and needs
// fixed to only perform one pass.
nodeCountMap.put(state.toString(), getNodeCount(null, state));
@Override // NodeManagerMXBean
public Map<String, Map<String, Integer>> getNodeCount() {
Map<String, Map<String, Integer>> nodes = new HashMap<>();
for (NodeOperationalState opState : NodeOperationalState.values()) {
Map<String, Integer> states = new HashMap<>();
for (NodeState health : NodeState.values()) {
states.put(health.name(), 0);
}
nodes.put(opState.name(), states);
}
for (DatanodeInfo dni : nodeStateManager.getAllNodes()) {
NodeStatus status = dni.getNodeStatus();
nodes.get(status.getOperationalState().name())
.compute(status.getHealth().name(), (k, v) -> v + 1);
}
return nodeCountMap;
return nodes;
}
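The method above seeds every (operational state, health) combination with zero before counting, so the `compute` remapping function never sees a null value and JMX always reports the full cross product, including empty buckets. A standalone sketch of that seed-then-compute pattern:

```java
import java.util.HashMap;
import java.util.Map;

// Seed-then-compute: pre-populating every key keeps the remapping function
// null-safe and guarantees zero-valued entries for states with no nodes.
public final class SeededCounterDemo {
  public static void main(String[] args) {
    Map<String, Integer> counts = new HashMap<>();
    for (String health : new String[] {"HEALTHY", "STALE", "DEAD"}) {
      counts.put(health, 0); // seed, so compute never receives null below
    }
    counts.compute("HEALTHY", (k, v) -> v + 1); // v is 0 here, never null
    System.out.println(counts); // e.g. {DEAD=0, HEALTHY=1, STALE=0}
  }
}
```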

// We should introduce DISK, SSD, etc., notion in
// SCMNodeStat and try to use it.
@Override
@Override // NodeManagerMXBean
public Map<String, Long> getNodeInfo() {
long diskCapacity = 0L;
long diskUsed = 0L;
long diskRemaning = 0L;

long ssdCapacity = 0L;
long ssdUsed = 0L;
long ssdRemaining = 0L;

List<DatanodeInfo> healthyNodes = nodeStateManager.getHealthyNodes();
List<DatanodeInfo> staleNodes = nodeStateManager.getStaleNodes();

List<DatanodeInfo> datanodes = new ArrayList<>(healthyNodes);
datanodes.addAll(staleNodes);
Map<String, Long> nodeInfo = new HashMap<>();
// Compute all the possible stats from the enums, and default to zero:
for (UsageStates s : UsageStates.values()) {
for (UsageMetrics stat : UsageMetrics.values()) {
nodeInfo.put(s.label + stat.name(), 0L);
}
}

for (DatanodeInfo dnInfo : datanodes) {
List<StorageReportProto> storageReportProtos = dnInfo.getStorageReports();
for (DatanodeInfo node : nodeStateManager.getAllNodes()) {
String keyPrefix = "";
NodeStatus status = node.getNodeStatus();
if (status.isMaintenance()) {
keyPrefix = UsageStates.MAINT.getLabel();
} else if (status.isDecommission()) {
keyPrefix = UsageStates.DECOM.getLabel();
} else if (status.isAlive()) {
// In service and not dead
keyPrefix = UsageStates.ONLINE.getLabel();
} else {
// Dead in-service node, skip it
continue;
}
List<StorageReportProto> storageReportProtos = node.getStorageReports();
for (StorageReportProto reportProto : storageReportProtos) {
if (reportProto.getStorageType() ==
StorageContainerDatanodeProtocolProtos.StorageTypeProto.DISK) {
diskCapacity += reportProto.getCapacity();
diskRemaning += reportProto.getRemaining();
diskUsed += reportProto.getScmUsed();
nodeInfo.compute(keyPrefix + UsageMetrics.DiskCapacity.name(),
(k, v) -> v + reportProto.getCapacity());
nodeInfo.compute(keyPrefix + UsageMetrics.DiskRemaining.name(),
(k, v) -> v + reportProto.getRemaining());
nodeInfo.compute(keyPrefix + UsageMetrics.DiskUsed.name(),
(k, v) -> v + reportProto.getScmUsed());
} else if (reportProto.getStorageType() ==
StorageContainerDatanodeProtocolProtos.StorageTypeProto.SSD) {
ssdCapacity += reportProto.getCapacity();
ssdRemaining += reportProto.getRemaining();
ssdUsed += reportProto.getScmUsed();
nodeInfo.compute(keyPrefix + UsageMetrics.SSDCapacity.name(),
(k, v) -> v + reportProto.getCapacity());
nodeInfo.compute(keyPrefix + UsageMetrics.SSDRemaining.name(),
(k, v) -> v + reportProto.getRemaining());
nodeInfo.compute(keyPrefix + UsageMetrics.SSDUsed.name(),
(k, v) -> v + reportProto.getScmUsed());
}
}
}
return nodeInfo;
}

Map<String, Long> nodeInfo = new HashMap<>();
nodeInfo.put("DISKCapacity", diskCapacity);
nodeInfo.put("DISKUsed", diskUsed);
nodeInfo.put("DISKRemaining", diskRemaning);
private enum UsageMetrics {
DiskCapacity,
DiskUsed,
DiskRemaining,
SSDCapacity,
SSDUsed,
SSDRemaining
}

nodeInfo.put("SSDCapacity", ssdCapacity);
nodeInfo.put("SSDUsed", ssdUsed);
nodeInfo.put("SSDRemaining", ssdRemaining);
return nodeInfo;
private enum UsageStates {
ONLINE(""),
MAINT("Maintenance"),
DECOM("Decommissioned");

private final String label;

public String getLabel() {
return label;
}

UsageStates(String label) {
this.label = label;
}
}
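Taken together, the two enums above mean `getNodeInfo()` publishes the cross product of usage-state labels and metric names, with an empty prefix for online nodes, so e.g. `DiskCapacity` and `MaintenanceDiskUsed`. A small sketch that prints the eighteen expected keys under those assumptions:

```java
// Prints the 18 keys getNodeInfo() seeds: 3 usage-state labels x 6 metrics.
public final class NodeInfoKeys {
  public static void main(String[] args) {
    String[] labels = {"", "Maintenance", "Decommissioned"};
    String[] metrics = {"DiskCapacity", "DiskUsed", "DiskRemaining",
        "SSDCapacity", "SSDUsed", "SSDRemaining"};
    for (String label : labels) {
      for (String metric : metrics) {
        System.out.println(label + metric); // e.g. "MaintenanceDiskUsed"
      }
    }
  }
}
```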

/**
@@ -18,15 +18,12 @@

package org.apache.hadoop.hdds.scm.node;

import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE;

import java.util.Map;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsInfo;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
@@ -36,6 +33,7 @@
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.util.StringUtils;

/**
* This class maintains Node related metrics.
@@ -51,6 +49,7 @@ public final class SCMNodeMetrics implements MetricsSource {
private @Metric MutableCounterLong numHBProcessingFailed;
private @Metric MutableCounterLong numNodeReportProcessed;
private @Metric MutableCounterLong numNodeReportProcessingFailed;

private final MetricsRegistry registry;
private final NodeManagerMXBean managerMXBean;
@@ -61,6 +60,7 @@ public final class SCMNodeMetrics implements MetricsSource {
private SCMNodeMetrics(NodeManagerMXBean managerMXBean) {
this.managerMXBean = managerMXBean;
this.registry = new MetricsRegistry(recordInfo);
}

/**
@@ -116,39 +116,58 @@ void incNumNodeReportProcessingFailed() {
@Override
@SuppressWarnings("SuspiciousMethodCalls")
public void getMetrics(MetricsCollector collector, boolean all) {
Map<String, Integer> nodeCount = managerMXBean.getNodeCount();
Map<String, Map<String, Integer>> nodeCount = managerMXBean.getNodeCount();
Map<String, Long> nodeInfo = managerMXBean.getNodeInfo();

registry.snapshot(
collector.addRecord(registry.info()) // Add annotated ones first
.addGauge(Interns.info(
"HealthyNodes",
"Number of healthy datanodes"),
nodeCount.get(HEALTHY.toString()))
.addGauge(Interns.info("StaleNodes",
"Number of stale datanodes"),
nodeCount.get(STALE.toString()))
.addGauge(Interns.info("DeadNodes",
"Number of dead datanodes"),
nodeCount.get(DEAD.toString()))
.addGauge(Interns.info("DiskCapacity",
"Total disk capacity"),
nodeInfo.get("DISKCapacity"))
.addGauge(Interns.info("DiskUsed",
"Total disk capacity used"),
nodeInfo.get("DISKUsed"))
.addGauge(Interns.info("DiskRemaining",
"Total disk capacity remaining"),
nodeInfo.get("DISKRemaining"))
.addGauge(Interns.info("SSDCapacity",
"Total ssd capacity"),
nodeInfo.get("SSDCapacity"))
.addGauge(Interns.info("SSDUsed",
"Total ssd capacity used"),
nodeInfo.get("SSDUsed"))
.addGauge(Interns.info("SSDRemaining",
"Total disk capacity remaining"),
nodeInfo.get("SSDRemaining")),
all);
/*
* Loop over the node count map and create a metric for the cross product
* of all operational and health states, i.e.:
*   InServiceHealthy
*   InServiceStale
*   ...
*   EnteringMaintenanceHealthy
*   ...
*/
MetricsRecordBuilder metrics = collector.addRecord(registry.info());
for (Map.Entry<String, Map<String, Integer>> e : nodeCount.entrySet()) {
for (Map.Entry<String, Integer> h : e.getValue().entrySet()) {
metrics.addGauge(
Interns.info(
StringUtils.camelize(e.getKey() + "_" + h.getKey() + "_nodes"),
"Number of " + e.getKey() + " " + h.getKey() + " datanodes"),
h.getValue());
}
}

for (Map.Entry<String, Long> e : nodeInfo.entrySet()) {
metrics.addGauge(
Interns.info(e.getKey(), diskMetricDescription(e.getKey())),
e.getValue());
}
registry.snapshot(metrics, all);
}

private String diskMetricDescription(String metric) {
StringBuilder sb = new StringBuilder();
sb.append("Total");
if (metric.contains("Maintenance")) {
sb.append(" maintenance");
} else if (metric.contains("Decommissioned")) {
sb.append(" decommissioned");
}
if (metric.contains("DiskCapacity")) {
sb.append(" disk capacity");
} else if (metric.contains("DiskUsed")) {
sb.append(" disk capacity used");
} else if (metric.contains("DiskRemaining")) {
sb.append(" disk capacity remaining");
} else if (metric.contains("SSDCapacity")) {
sb.append(" SSD capacity");
} else if (metric.contains("SSDUsed")) {
sb.append(" SSD capacity used");
} else if (metric.contains("SSDRemaining")) {
sb.append(" SSD capacity remaining");
}
return sb.toString();
}
}
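Given the naming loop above, a bucket like IN_SERVICE/HEALTHY surfaces as the gauge `InServiceHealthyNodes`. If you want to predict the published names without depending on hadoop-common, the transformation can be approximated as below; this is an assumption about `StringUtils.camelize`'s behaviour, not a copy of it:

```java
// Assumed behaviour: lower-case the input, split on '_', capitalise each
// word, concatenate. "IN_SERVICE_HEALTHY_nodes" -> "InServiceHealthyNodes".
public final class MetricNameDemo {

  static String camelize(String s) {
    StringBuilder sb = new StringBuilder();
    for (String word : s.toLowerCase().split("_")) {
      if (!word.isEmpty()) {
        sb.append(Character.toUpperCase(word.charAt(0)))
            .append(word.substring(1));
      }
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    System.out.println(camelize("IN_SERVICE_HEALTHY_nodes"));
    System.out.println(camelize("ENTERING_MAINTENANCE_STALE_nodes"));
  }
}
```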
@@ -34,6 +34,7 @@
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState;
import org.apache.hadoop.hdds.protocol.proto
.StorageContainerDatanodeProtocolProtos.NodeReportProto;
import org.apache.hadoop.hdds.protocol.proto
@@ -471,12 +472,23 @@ public Boolean isNodeRegistered(
}

@Override
public Map<String, Integer> getNodeCount() {
Map<String, Integer> nodeCountMap = new HashMap<String, Integer>();
public Map<String, Map<String, Integer>> getNodeCount() {
Map<String, Map<String, Integer>> nodes = new HashMap<>();
for (NodeOperationalState opState : NodeOperationalState.values()) {
Map<String, Integer> states = new HashMap<>();
for (HddsProtos.NodeState health : HddsProtos.NodeState.values()) {
states.put(health.name(), 0);
}
nodes.put(opState.name(), states);
}
// At the moment MockNodeManager is not aware of decommission and
// maintenance states, therefore count all nodes under IN_SERVICE. This
// will be fixed as part of HDDS-2673.
for (HddsProtos.NodeState state : HddsProtos.NodeState.values()) {
nodeCountMap.put(state.toString(), getNodeCount(null, state));
nodes.get(NodeOperationalState.IN_SERVICE.name())
.compute(state.name(), (k, v) -> v + getNodeCount(null, state));
}
return nodeCountMap;
return nodes;
}
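A hypothetical test-style check of the new shape; `NodeCountShapeCheck` is illustrative and assumes the `MockNodeManager` and proto types from this module:

```java
import java.util.Map;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState;
import org.junit.Assert;

// Hypothetical helper: verifies that every operational state is present
// and that each carries a counter for every health state.
public final class NodeCountShapeCheck {

  private NodeCountShapeCheck() {
  }

  static void verify(MockNodeManager nodeManager) {
    Map<String, Map<String, Integer>> counts = nodeManager.getNodeCount();
    Assert.assertEquals(NodeOperationalState.values().length, counts.size());
    for (Map<String, Integer> byHealth : counts.values()) {
      Assert.assertEquals(HddsProtos.NodeState.values().length,
          byHealth.size());
    }
  }
}
```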

@Override
@@ -243,7 +243,7 @@ public void close() throws IOException {
}

@Override
public Map<String, Integer> getNodeCount() {
public Map<String, Map<String, Integer>> getNodeCount() {
return null;
}
