diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index 7b007fdca1f0..787f023df2ea 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -392,4 +392,9 @@ private HddsConfigKeys() {
public static final String OZONE_AUDIT_LOG_DEBUG_CMD_LIST_DNAUDIT =
"ozone.audit.log.debug.cmd.list.dnaudit";
+
+ public static final String HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY =
+ "hdds.datanode.slow.op.warning.threshold";
+ public static final String HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT =
+ "500ms";
}
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 7debcd479ec4..9dad826c7557 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -3418,6 +3418,14 @@
Timeout for the request submitted directly to Ratis in datanode.
</description>
</property>
+ <property>
+ <name>hdds.datanode.slow.op.warning.threshold</name>
+ <tag>OZONE, DATANODE, PERFORMANCE</tag>
+ <value>500ms</value>
+ <description>
+ Threshold for printing slow-operation audit logs. Datanode requests that take at least this long are logged as slow operations.
+ </description>
+ </property>
<property>
<name>ozone.om.keyname.character.check.enabled</name>
<tag>OZONE, OM</tag>
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
index eeb3548d5f87..63e51bdfd588 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
@@ -75,7 +75,9 @@
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
+import static org.apache.hadoop.ozone.audit.AuditLogger.PerformanceStringBuilder;
import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest;
import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest;
import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;
@@ -101,6 +103,7 @@ public class HddsDispatcher implements ContainerDispatcher, Auditor {
private String clusterId;
private ContainerMetrics metrics;
private final TokenVerifier tokenVerifier;
+ private long slowOpThresholdMs;
/**
* Constructs an OzoneContainer that receives calls from
@@ -121,6 +124,7 @@ public HddsDispatcher(ConfigurationSource config, ContainerSet contSet,
HddsConfigKeys.HDDS_CONTAINER_CLOSE_THRESHOLD_DEFAULT);
this.tokenVerifier = tokenVerifier != null ? tokenVerifier
: new NoopTokenVerifier();
+ this.slowOpThresholdMs = getSlowOpThresholdMs(conf);
protocolMetrics =
new ProtocolMessageMetrics<>(
@@ -196,6 +200,7 @@ private ContainerCommandResponseProto dispatchRequest(
AuditAction action = getAuditAction(msg.getCmdType());
EventType eventType = getEventType(msg);
Map<String, String> params = getAuditParams(msg);
+ PerformanceStringBuilder perf = new PerformanceStringBuilder();
ContainerType containerType;
ContainerCommandResponseProto responseProto = null;
@@ -326,10 +331,11 @@ private ContainerCommandResponseProto dispatchRequest(
audit(action, eventType, params, AuditEventStatus.FAILURE, ex);
return ContainerUtils.logAndReturnError(LOG, ex, msg);
}
+ perf.appendPreOpLatencyMs(Time.monotonicNow() - startTime);
responseProto = handler.handle(msg, container, dispatcherContext);
+ long opLatencyMs = Time.monotonicNow() - startTime;
if (responseProto != null) {
- metrics.incContainerOpsLatencies(cmdType,
- Time.monotonicNow() - startTime);
+ metrics.incContainerOpsLatencies(cmdType, opLatencyMs);
// If the request is of Write Type and the container operation
// is unsuccessful, it implies the applyTransaction on the container
@@ -402,6 +408,8 @@ private ContainerCommandResponseProto dispatchRequest(
audit(action, eventType, params, AuditEventStatus.FAILURE,
new Exception(responseProto.getMessage()));
}
+ perf.appendOpLatencyMs(opLatencyMs);
+ performanceAudit(action, params, perf, opLatencyMs);
return responseProto;
} else {
@@ -412,6 +420,13 @@ private ContainerCommandResponseProto dispatchRequest(
}
}
+ private long getSlowOpThresholdMs(ConfigurationSource config) {
+ return config.getTimeDuration(
+ HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY,
+ HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT,
+ TimeUnit.MILLISECONDS);
+ }
+
private void updateBCSID(Container container,
DispatcherContext dispatcherContext, Type cmdType) {
if (dispatcherContext != null && (cmdType == Type.PutBlock
@@ -682,6 +697,26 @@ private void audit(AuditAction action, EventType eventType,
}
}
+ private void performanceAudit(AuditAction action, Map<String, String> params,
+ PerformanceStringBuilder performance, long opLatencyMs) {
+ if (isOperationSlow(opLatencyMs)) {
+ AuditMessage msg =
+ buildAuditMessageForPerformance(action, params, performance);
+ AUDIT.logPerformance(msg);
+ }
+ }
+
+ public AuditMessage buildAuditMessageForPerformance(AuditAction op,
+ Map<String, String> auditMap, PerformanceStringBuilder performance) {
+ return new AuditMessage.Builder()
+ .setUser(null)
+ .atIp(null)
+ .forOperation(op)
+ .withParams(auditMap)
+ .setPerformance(performance)
+ .build();
+ }
+
//TODO: use GRPC to fetch user and ip details
@Override
public AuditMessage buildAuditMessageForSuccess(AuditAction op,
@@ -845,6 +880,8 @@ private static Map<String, String> getAuditParams(
case ReadChunk:
auditParams.put("blockData",
BlockID.getFromProtobuf(msg.getReadChunk().getBlockID()).toString());
+ auditParams.put("blockDataSize",
+ String.valueOf(msg.getReadChunk().getChunkData().getLen()));
return auditParams;
case DeleteChunk:
@@ -857,6 +894,8 @@ private static Map getAuditParams(
auditParams.put("blockData",
BlockID.getFromProtobuf(msg.getWriteChunk().getBlockID())
.toString());
+ auditParams.put("blockDataSize",
+ String.valueOf(msg.getWriteChunk().getChunkData().getLen()));
return auditParams;
case ListChunk:
@@ -873,6 +912,8 @@ private static Map getAuditParams(
auditParams.put("blockData",
BlockData.getFromProtoBuf(msg.getPutSmallFile()
.getBlock().getBlockData()).toString());
+ auditParams.put("blockDataSize",
+ String.valueOf(msg.getPutSmallFile().getChunkInfo().getLen()));
} catch (IOException ex) {
if (LOG.isTraceEnabled()) {
LOG.trace("Encountered error parsing BlockData from protobuf: "
@@ -904,4 +945,7 @@ private static Map getAuditParams(
}
+ private boolean isOperationSlow(long opLatencyMs) {
+ return opLatencyMs >= slowOpThresholdMs;
+ }
}
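Taken together, the HddsDispatcher changes read the threshold once at construction and gate the new performance audit on it: pre-op latency is captured just before handler.handle(), total latency just after, and only operations at or above the threshold are logged. A rough, self-contained sketch of that timing and gating pattern follows; the class name, the sleep, and the println are illustrative stand-ins, not part of the patch:

import java.util.concurrent.TimeUnit;

import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.util.Time;

// Illustrative stand-in for the dispatcher-side timing; not part of the patch.
public final class SlowOpGateSketch {
  public static void main(String[] args) throws InterruptedException {
    OzoneConfiguration conf = new OzoneConfiguration();
    // Same lookup as HddsDispatcher#getSlowOpThresholdMs above; values such as
    // "500ms" or "1s" work because getTimeDuration parses unit suffixes.
    long slowOpThresholdMs = conf.getTimeDuration(
        HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY,
        HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT,
        TimeUnit.MILLISECONDS);

    long startTime = Time.monotonicNow();
    Thread.sleep(600);                      // simulated handler work
    long opLatencyMs = Time.monotonicNow() - startTime;

    // Mirrors isOperationSlow(): only ops at or above the threshold are reported.
    if (opLatencyMs >= slowOpThresholdMs) {
      System.out.println("slow op: " + opLatencyMs + " ms");
    }
  }
}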
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
index 362c08c6a94b..c50097e9cb95 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
@@ -805,6 +805,7 @@ ContainerCommandResponseProto handleWriteChunk(
WriteChunkRequestProto writeChunk = request.getWriteChunk();
BlockID blockID = BlockID.getFromProtobuf(writeChunk.getBlockID());
ContainerProtos.ChunkInfo chunkInfoProto = writeChunk.getChunkData();
+
ChunkInfo chunkInfo = ChunkInfo.getFromProtoBuf(chunkInfoProto);
Preconditions.checkNotNull(chunkInfo);
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
index 33ad70869873..042887e4e533 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
@@ -49,6 +49,7 @@ public class AuditLogger {
private static final Marker WRITE_MARKER = AuditMarker.WRITE.getMarker();
private static final Marker READ_MARKER = AuditMarker.READ.getMarker();
private static final Marker AUTH_MARKER = AuditMarker.AUTH.getMarker();
+ private static final Marker PERFORMANCE = AuditMarker.PERFORMANCE.getMarker();
private final AtomicReference<Set<String>> debugCmdSetRef =
new AtomicReference<>(new HashSet<>());
public static final String AUDIT_LOG_DEBUG_CMD_LIST_PREFIX =
@@ -118,6 +119,10 @@ public void logWrite(AuditMessage auditMessage) {
}
}
+ public void logPerformance(AuditMessage msg) {
+ this.logger.logIfEnabled(FQCN, Level.INFO, PERFORMANCE, msg, null);
+ }
+
public void refreshDebugCmdSet() {
OzoneConfiguration conf = new OzoneConfiguration();
refreshDebugCmdSet(conf);
@@ -161,6 +166,22 @@ public void appendOpLatencyNanos(long nanos) {
append("opLatencyMs", TimeUnit.NANOSECONDS.toMillis(nanos));
}
+ /**
+ * Appends pre-operation latency in milliseconds.
+ * @param millis Latency in milliseconds.
+ */
+ public void appendPreOpLatencyMs(long millis) {
+ append("preOpLatencyMs", millis);
+ }
+
+ /**
+ * Appends whole operation latency in milliseconds.
+ * @param millis Latency in milliseconds.
+ */
+ public void appendOpLatencyMs(long millis) {
+ append("opLatencyMs", millis);
+ }
+
/**
* Appends the size in bytes.
* @param bytes Size in bytes.
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
index d9aed2bb65f3..2e991a6ea2e1 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
@@ -25,7 +25,8 @@
public enum AuditMarker {
WRITE(MarkerManager.getMarker("WRITE")),
READ(MarkerManager.getMarker("READ")),
- AUTH(MarkerManager.getMarker("AUTH")),;
+ AUTH(MarkerManager.getMarker("AUTH")),
+ PERFORMANCE(MarkerManager.getMarker("PERFORMANCE"));
private Marker marker;
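With the AuditLogger and AuditMarker pieces above, a performance entry is just an AuditMessage passed to logPerformance, which writes it at INFO level under the new PERFORMANCE marker. A rough sketch of assembling such a message outside the dispatcher follows; DNAction.WRITE_CHUNK, the parameter values, and the class name are illustrative assumptions, and AuditMessage.Builder#setPerformance is used only because the dispatcher code above relies on it:

import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.ozone.audit.AuditLogger;
import org.apache.hadoop.ozone.audit.AuditLoggerType;
import org.apache.hadoop.ozone.audit.AuditMessage;
import org.apache.hadoop.ozone.audit.DNAction;

import static org.apache.hadoop.ozone.audit.AuditLogger.PerformanceStringBuilder;

// Illustrative only; mirrors HddsDispatcher#buildAuditMessageForPerformance.
public final class PerformanceAuditSketch {
  private static final AuditLogger AUDIT =
      new AuditLogger(AuditLoggerType.DNLOGGER);

  public static void main(String[] args) {
    PerformanceStringBuilder perf = new PerformanceStringBuilder();
    perf.appendPreOpLatencyMs(40);    // setup/validation time before the handler
    perf.appendOpLatencyMs(750);      // total dispatch latency

    Map<String, String> params = new TreeMap<>();
    params.put("blockDataSize", "4194304");          // placeholder value

    AuditMessage msg = new AuditMessage.Builder()
        .setUser(null)
        .atIp(null)
        .forOperation(DNAction.WRITE_CHUNK)          // placeholder action
        .withParams(params)
        .setPerformance(perf)
        .build();

    // Written to the datanode audit log at INFO level under the PERFORMANCE marker.
    AUDIT.logPerformance(msg);
  }
}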