Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 101 additions & 96 deletions nvme_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,102 +24,110 @@

metrics = {
# fmt: off
# Host-specific metrics
"nvmecli": Info(
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
),

# Controller-specific (e.g. "nvme0") metrics
"avail_spare": Gauge(
"available_spare_ratio",
"Device available spare ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_busy_time": Counter(
"controller_busy_time_seconds",
"Device controller busy time in seconds",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_info": Info(
"controller",
"Controller information",
["controller", "model", "firmware", "serial", "transport"], namespace=namespace,
registry=registry,
),
"critical_warning": Gauge(
"critical_warning",
"Device critical warning bitmap field",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_read": Counter(
"data_units_read_total",
"Number of 512-byte data units read by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_written": Counter(
"data_units_written_total",
"Number of 512-byte data units written by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
),
"device_info": Info(
"device",
"Device information",
["device", "model", "firmware", "serial"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_read_commands": Counter(
"host_read_commands_total",
"Device read commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_write_commands": Counter(
"host_write_commands_total",
"Device write commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"media_errors": Counter(
"media_errors_total",
"Device media errors total",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"num_err_log_entries": Counter(
"num_err_log_entries_total",
"Device error log entry count",
["device"], namespace=namespace, registry=registry,
),
# FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
# will result in the metric having a "_info" suffix automatically appended, which is arguably
# a breaking change.
"nvmecli": Gauge(
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"percent_used": Gauge(
"percentage_used_ratio",
"Device percentage used ratio",
["device"], namespace=namespace, registry=registry,
),
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_cycles": Counter(
"power_cycles_total",
"Device number of power cycles",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_on_hours": Counter(
"power_on_hours_total",
"Device power-on hours",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"spare_thresh": Gauge(
"available_spare_threshold_ratio",
"Device available spare threshold ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"temperature": Gauge(
"temperature_celsius",
"Device temperature in degrees Celsius",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"unsafe_shutdowns": Counter(
"unsafe_shutdowns_total",
"Device number of unsafe shutdowns",
["controller"], namespace=namespace, registry=registry,
),

# Namespace-specific (e.g. "nvme0n1") metrics
"namespace_info": Info(
"namespace",
"Namespace information",
["device", "nsid", "controller"], namespace=namespace, registry=registry,
),
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
),
"used_bytes": Gauge(
Expand Down Expand Up @@ -159,75 +167,72 @@ def main():
cli_version = match.group(1)
else:
cli_version = "unknown"
metrics["nvmecli"].labels(cli_version).set(1)
metrics["nvmecli"].labels(cli_version)

device_list = exec_nvme_json("list")

for device in device_list["Devices"]:
for subsys in device["Subsystems"]:
for ctrl in subsys["Controllers"]:
ctrl_dev = ctrl["Controller"]

metrics["controller_info"].labels(
ctrl_dev,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
ctrl["Transport"],
)

for ns in ctrl["Namespaces"]:
device_name = ns["NameSpace"]

# FIXME: This metric ought to be refactored into a "controller_info" metric,
# since it contains information that is not unique to the namespace. However,
# previous versions of this collector erroneously referred to namespaces, e.g.
# "nvme0n1", as devices, so preserve the former behaviour for now.
metrics["device_info"].labels(
device_name,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
)
ns_dev = ns["NameSpace"]

metrics["sector_size"].labels(device_name).set(ns["SectorSize"])
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])

# FIXME: The smart-log should only need to be fetched once per controller, not
# per namespace. However, in order to preserve legacy metric labels, fetch it
# per namespace anyway. Most consumer grade SSDs will only have one namespace.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))

# Various counters in the NVMe specification are 128-bit and would lose
# precision if converted to a JSON number (i.e., a 64-bit float). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
# to int or float when using them in Counter metrics.
metrics["data_units_read"].labels(device_name).inc(
int(smart_log["data_units_read"])
)
metrics["data_units_written"].labels(device_name).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(device_name).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(device_name).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(device_name).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(device_name).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(device_name).inc(
int(smart_log["power_on_hours"])
)
metrics["controller_busy_time"].labels(device_name).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(device_name).inc(
int(smart_log["unsafe_shutdowns"])
metrics["namespace_info"].labels(
ns_dev,
ns["NSID"],
ctrl_dev,
)

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
metrics["sector_size"].labels(ns_dev).set(ns["SectorSize"])
metrics["physical_size"].labels(ns_dev).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(ns_dev).set(ns["UsedBytes"])

# Fetch the controller-wide (global) SMART log by omitting the --namespace-id flag.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"]))

# Various counters in the NVMe specification are 128-bit and would lose
# precision if converted to a JSON number (i.e., a 64-bit float). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
# or float when using them in Counter metrics.
metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"]))
metrics["data_units_written"].labels(ctrl_dev).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(ctrl_dev).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(ctrl_dev).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(ctrl_dev).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(ctrl_dev).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"]))
metrics["controller_busy_time"].labels(ctrl_dev).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"]))

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273)


if __name__ == "__main__":
Expand Down