Skip to content

Commit

Permalink
gpu_sysman: Add "LogMetrics" option
Browse files Browse the repository at this point in the history
This can be used for local real-time monitoring of GPU metrics without
the need to pull/parse collectd write plugin output.

Output is most readable when only one metric type and one MetricsOutput
variant are enabled, and collectd sees only a single GPU.  Alternatively,
one could grep the output for the relevant GPU, metric type and its
output variant.

Signed-off-by: Eero Tamminen <[email protected]>
  • Loading branch information
eero-t committed Feb 19, 2024
1 parent 07f21f7 commit 37f32b9
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 9 deletions.
12 changes: 12 additions & 0 deletions src/collectd.conf.pod
Original file line number Diff line number Diff line change
Expand Up @@ -3777,6 +3777,18 @@ If enabled, plugin logs at start some information about plugin
settings, all the GPUs detected through Sysman API, and enables
"pci_dev" PCI device ID label for the metrics.

=item B<LogMetrics>

If enabled, all metric values are also printed to the collectd log
(standard output by default). This can be useful for local real-time
monitoring / debugging of specific GPU metric values, as one does not
need to enable any write plugins.

Output is most readable when only one metric type and one "MetricsOutput"
variant are enabled, and collectd (container) sees only a single GPU.
Alternatively, one could grep the output for the relevant GPU, metric
type and its output variant, but that adds delay to the output.

=item B<MetricsOutput>

Set of "base", "rate", and "ratio" strings, separated by comma, colon,
Expand Down
60 changes: 51 additions & 9 deletions src/gpu_sysman.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <level_zero/ze_api.h>
#include <level_zero/zes_api.h>
Expand Down Expand Up @@ -143,6 +144,7 @@ static gpu_device_t *gpus;
static uint32_t gpu_count;
static struct {
bool gpuinfo;
bool logmetrics;
gpu_disable_t disabled;
output_t output;
uint32_t samples;
Expand All @@ -161,8 +163,9 @@ static struct {
#define KEY_DISABLE_TEMP "DisableTemperature"
#define KEY_DISABLE_THROTTLE "DisableThrottleTime"

#define KEY_METRICS_OUTPUT "MetricsOutput"
#define KEY_LOG_GPU_INFO "LogGpuInfo"
#define KEY_LOG_METRICS "LogMetrics"
#define KEY_METRICS_OUTPUT "MetricsOutput"
#define KEY_SAMPLES "Samples"
#define MAX_SAMPLES 64

Expand Down Expand Up @@ -789,8 +792,47 @@ static int gpu_init(void) {
}

/* Add GPU resource attribs to given metric family and submit family to
* collectd. Resets metric family after dispatch */
* collectd, and log the metric if metric logging is enabled.
* Resets metric family after dispatch */
static void gpu_submit(gpu_device_t *gpu, metric_family_t *fam) {
const char *pci_bdf = label_set_get(gpu->attribs, "pci_bdf");
assert(pci_bdf);

/* logmetrics readability: skip common BDF address prefix */
if (strncmp("0000:", pci_bdf, 5) == 0) {
pci_bdf += 5;
}

const char *name = fam->name;
/* logmetrics readability: skip common metrics name prefix */
if (strncmp(METRIC_PREFIX, name, strlen(METRIC_PREFIX)) != 0) {
name += strlen(METRIC_PREFIX);
}

struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);

for (size_t i = 0; i < fam->metric.num; i++) {
metric_t *m = fam->metric.ptr + i;

/* log metric values in addition to dispatching them? */
if (config.logmetrics) {
const char *type = "<type>";
char *labels[] = {"direction", "location", "type"};
for (size_t i = 0; i < STATIC_ARRAY_SIZE(labels); i++) {
char const *l = metric_label_get(m, labels[i]);
if (l != NULL) {
type = l;
break;
}
}
INFO("[%7ld.%03ld] %s: %s / %s [%ld]: %.3f", ts.tv_sec,
ts.tv_nsec / 1000000, pci_bdf, name, type, i,
fam->type == METRIC_TYPE_COUNTER ? m->value.counter
: m->value.gauge);
}
}

fam->resource = gpu->attribs;
int status = plugin_dispatch_metric_family(fam);
if (status != 0) {
Expand Down Expand Up @@ -2563,6 +2605,8 @@ static int gpu_config_parse(const char *key, const char *value) {
config.disabled.throttle = IS_TRUE(value);
} else if (strcasecmp(key, KEY_LOG_GPU_INFO) == 0) {
config.gpuinfo = IS_TRUE(value);
} else if (strcasecmp(key, KEY_LOG_METRICS) == 0) {
config.logmetrics = IS_TRUE(value);
} else if (strcasecmp(key, KEY_METRICS_OUTPUT) == 0) {
config.output = 0;
static const char delim[] = ",:/ ";
Expand Down Expand Up @@ -2616,13 +2660,11 @@ static int gpu_config_parse(const char *key, const char *value) {
void module_register(void) {
/* NOTE: key strings *must* be static */
static const char *config_keys[] = {
KEY_DISABLE_ENGINE, KEY_DISABLE_ENGINE_SINGLE,
KEY_DISABLE_FABRIC, KEY_DISABLE_FREQ,
KEY_DISABLE_MEM, KEY_DISABLE_MEMBW,
KEY_DISABLE_POWER, KEY_DISABLE_RAS,
KEY_DISABLE_RAS_SEPARATE, KEY_DISABLE_TEMP,
KEY_DISABLE_THROTTLE, KEY_METRICS_OUTPUT,
KEY_LOG_GPU_INFO, KEY_SAMPLES};
KEY_DISABLE_ENGINE, KEY_DISABLE_ENGINE_SINGLE, KEY_DISABLE_FABRIC,
KEY_DISABLE_FREQ, KEY_DISABLE_MEM, KEY_DISABLE_MEMBW,
KEY_DISABLE_POWER, KEY_DISABLE_RAS, KEY_DISABLE_RAS_SEPARATE,
KEY_DISABLE_TEMP, KEY_DISABLE_THROTTLE, KEY_METRICS_OUTPUT,
KEY_LOG_GPU_INFO, KEY_LOG_METRICS, KEY_SAMPLES};
const int config_keys_num = STATIC_ARRAY_SIZE(config_keys);

plugin_register_config(PLUGIN_NAME, gpu_config_parse, config_keys,
Expand Down
2 changes: 2 additions & 0 deletions src/gpu_sysman_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,7 @@ int main(int argc, const char **argv) {

assert(registry.config("DisableSeparateErrors", "false") == 0);
set_verbose(VERBOSE_CALLS_METRICS, VERBOSE_METRICS_NORMAL);
assert(registry.config("LogMetrics", "true") == 0);
assert(registry.init() == 0);

fprintf(stderr, "Query all metrics for the first time, with separate errors "
Expand All @@ -1487,6 +1488,7 @@ int main(int argc, const char **argv) {
assert(globs.warnings == 0);
/* per-time counters do not report on first round */
assert(validate_and_reset_saved_metrics(1, 0) > 0);
assert(registry.config("LogMetrics", "false") == 0);
fprintf(stderr, "metrics query round 1: PASS\n\n");

api_calls = globs.api_calls;
Expand Down

0 comments on commit 37f32b9

Please sign in to comment.