diff --git a/.chloggen/systemd-restart-count.yaml b/.chloggen/systemd-restart-count.yaml new file mode 100644 index 0000000000000..7927fc6292d97 --- /dev/null +++ b/.chloggen/systemd-restart-count.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog) +component: receiver/systemd + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add metric for number of times a service has restarted. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [45071] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/receiver/systemdreceiver/documentation.md b/receiver/systemdreceiver/documentation.md index c23df3a47cc03..14be711373f55 100644 --- a/receiver/systemdreceiver/documentation.md +++ b/receiver/systemdreceiver/documentation.md @@ -40,6 +40,29 @@ Total CPU time spent by this service. 
| ---- | ----------- | ------ | -------- | | systemd.unit.active_state | The active state of the unit (https://www.freedesktop.org/software/systemd/man/latest/systemd.html#Units) | Str: ``active``, ``reloading``, ``inactive``, ``failed``, ``activating``, ``deactivating``, ``maintenance``, ``refreshing`` | Recommended | +## Optional Metrics + +The following metrics are not emitted by default. Each of them can be enabled by applying the following configuration: + +```yaml +metrics: + <metric_name>: + enabled: true +``` + +### systemd.service.restarts + +Number of automatic restarts for the service. + +This exposes services' `NRestarts` property as a metric. This only tracks +automatic service restarts (restarts when the process exits), and does +not include manual restarts (e.g. from `systemctl restart`). + + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | Stability | +| ---- | ----------- | ---------- | ----------------------- | --------- | --------- | +| {restarts} | Sum | Int | Cumulative | true | Development | + ## Resource Attributes | Name | Description | Values | Enabled | diff --git a/receiver/systemdreceiver/internal/metadata/generated_config.go b/receiver/systemdreceiver/internal/metadata/generated_config.go index b9f49c0b52813..84b32d6d892ab 100644 --- a/receiver/systemdreceiver/internal/metadata/generated_config.go +++ b/receiver/systemdreceiver/internal/metadata/generated_config.go @@ -28,8 +28,9 @@ func (ms *MetricConfig) Unmarshal(parser *confmap.Conf) error { // MetricsConfig provides config for systemd metrics. 
type MetricsConfig struct { - SystemdServiceCPUTime MetricConfig `mapstructure:"systemd.service.cpu.time"` - SystemdUnitState MetricConfig `mapstructure:"systemd.unit.state"` + SystemdServiceCPUTime MetricConfig `mapstructure:"systemd.service.cpu.time"` + SystemdServiceRestarts MetricConfig `mapstructure:"systemd.service.restarts"` + SystemdUnitState MetricConfig `mapstructure:"systemd.unit.state"` } func DefaultMetricsConfig() MetricsConfig { @@ -37,6 +38,9 @@ func DefaultMetricsConfig() MetricsConfig { SystemdServiceCPUTime: MetricConfig{ Enabled: true, }, + SystemdServiceRestarts: MetricConfig{ + Enabled: false, + }, SystemdUnitState: MetricConfig{ Enabled: true, }, diff --git a/receiver/systemdreceiver/internal/metadata/generated_config_test.go b/receiver/systemdreceiver/internal/metadata/generated_config_test.go index e4fa074f81570..0085b36c472fe 100644 --- a/receiver/systemdreceiver/internal/metadata/generated_config_test.go +++ b/receiver/systemdreceiver/internal/metadata/generated_config_test.go @@ -26,8 +26,9 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "all_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - SystemdServiceCPUTime: MetricConfig{Enabled: true}, - SystemdUnitState: MetricConfig{Enabled: true}, + SystemdServiceCPUTime: MetricConfig{Enabled: true}, + SystemdServiceRestarts: MetricConfig{Enabled: true}, + SystemdUnitState: MetricConfig{Enabled: true}, }, ResourceAttributes: ResourceAttributesConfig{ SystemdUnitName: ResourceAttributeConfig{Enabled: true}, @@ -38,8 +39,9 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "none_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - SystemdServiceCPUTime: MetricConfig{Enabled: false}, - SystemdUnitState: MetricConfig{Enabled: false}, + SystemdServiceCPUTime: MetricConfig{Enabled: false}, + SystemdServiceRestarts: MetricConfig{Enabled: false}, + SystemdUnitState: MetricConfig{Enabled: false}, }, ResourceAttributes: ResourceAttributesConfig{ SystemdUnitName: 
ResourceAttributeConfig{Enabled: false}, diff --git a/receiver/systemdreceiver/internal/metadata/generated_metrics.go b/receiver/systemdreceiver/internal/metadata/generated_metrics.go index ba9658f737dbf..aadcb8deb8be0 100644 --- a/receiver/systemdreceiver/internal/metadata/generated_metrics.go +++ b/receiver/systemdreceiver/internal/metadata/generated_metrics.go @@ -92,14 +92,18 @@ var MetricsInfo = metricsInfo{ SystemdServiceCPUTime: metricInfo{ Name: "systemd.service.cpu.time", }, + SystemdServiceRestarts: metricInfo{ + Name: "systemd.service.restarts", + }, SystemdUnitState: metricInfo{ Name: "systemd.unit.state", }, } type metricsInfo struct { - SystemdServiceCPUTime metricInfo - SystemdUnitState metricInfo + SystemdServiceCPUTime metricInfo + SystemdServiceRestarts metricInfo + SystemdUnitState metricInfo } type metricInfo struct { @@ -159,6 +163,57 @@ func newMetricSystemdServiceCPUTime(cfg MetricConfig) metricSystemdServiceCPUTim return m } +type metricSystemdServiceRestarts struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills systemd.service.restarts metric with initial data. +func (m *metricSystemdServiceRestarts) init() { + m.data.SetName("systemd.service.restarts") + m.data.SetDescription("Number of automatic restarts for the service.") + m.data.SetUnit("{restarts}") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) +} + +func (m *metricSystemdServiceRestarts) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. 
+func (m *metricSystemdServiceRestarts) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricSystemdServiceRestarts) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricSystemdServiceRestarts(cfg MetricConfig) metricSystemdServiceRestarts { + m := metricSystemdServiceRestarts{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + type metricSystemdUnitState struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. @@ -223,6 +278,7 @@ type MetricsBuilder struct { resourceAttributeIncludeFilter map[string]filter.Filter resourceAttributeExcludeFilter map[string]filter.Filter metricSystemdServiceCPUTime metricSystemdServiceCPUTime + metricSystemdServiceRestarts metricSystemdServiceRestarts metricSystemdUnitState metricSystemdUnitState } @@ -250,6 +306,7 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.Settings, opt metricsBuffer: pmetric.NewMetrics(), buildInfo: settings.BuildInfo, metricSystemdServiceCPUTime: newMetricSystemdServiceCPUTime(mbc.Metrics.SystemdServiceCPUTime), + metricSystemdServiceRestarts: newMetricSystemdServiceRestarts(mbc.Metrics.SystemdServiceRestarts), metricSystemdUnitState: newMetricSystemdUnitState(mbc.Metrics.SystemdUnitState), resourceAttributeIncludeFilter: make(map[string]filter.Filter), resourceAttributeExcludeFilter: make(map[string]filter.Filter), @@ -330,6 +387,7 @@ func (mb *MetricsBuilder) EmitForResource(options ...ResourceMetricsOption) { ils.Scope().SetVersion(mb.buildInfo.Version) ils.Metrics().EnsureCapacity(mb.metricsCapacity) 
mb.metricSystemdServiceCPUTime.emit(ils.Metrics()) + mb.metricSystemdServiceRestarts.emit(ils.Metrics()) mb.metricSystemdUnitState.emit(ils.Metrics()) for _, op := range options { @@ -367,6 +425,11 @@ func (mb *MetricsBuilder) RecordSystemdServiceCPUTimeDataPoint(ts pcommon.Timest mb.metricSystemdServiceCPUTime.recordDataPoint(mb.startTime, ts, val, cpuModeAttributeValue.String()) } +// RecordSystemdServiceRestartsDataPoint adds a data point to systemd.service.restarts metric. +func (mb *MetricsBuilder) RecordSystemdServiceRestartsDataPoint(ts pcommon.Timestamp, val int64) { + mb.metricSystemdServiceRestarts.recordDataPoint(mb.startTime, ts, val) +} + // RecordSystemdUnitStateDataPoint adds a data point to systemd.unit.state metric. func (mb *MetricsBuilder) RecordSystemdUnitStateDataPoint(ts pcommon.Timestamp, val int64, systemdUnitActiveStateAttributeValue AttributeSystemdUnitActiveState) { mb.metricSystemdUnitState.recordDataPoint(mb.startTime, ts, val, systemdUnitActiveStateAttributeValue.String()) diff --git a/receiver/systemdreceiver/internal/metadata/generated_metrics_test.go b/receiver/systemdreceiver/internal/metadata/generated_metrics_test.go index 2dbac9d966052..384e67d7de139 100644 --- a/receiver/systemdreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/systemdreceiver/internal/metadata/generated_metrics_test.go @@ -72,6 +72,9 @@ func TestMetricsBuilder(t *testing.T) { allMetricsCount++ mb.RecordSystemdServiceCPUTimeDataPoint(ts, 1, AttributeCPUModeSystem) + allMetricsCount++ + mb.RecordSystemdServiceRestartsDataPoint(ts, 1) + defaultMetricsCount++ allMetricsCount++ mb.RecordSystemdUnitStateDataPoint(ts, 1, AttributeSystemdUnitActiveStateActive) @@ -117,6 +120,20 @@ func TestMetricsBuilder(t *testing.T) { attrVal, ok := dp.Attributes().Get("cpu.mode") assert.True(t, ok) assert.Equal(t, "system", attrVal.Str()) + case "systemd.service.restarts": + assert.False(t, validatedMetrics["systemd.service.restarts"], "Found a duplicate in the 
metrics slice: systemd.service.restarts") + validatedMetrics["systemd.service.restarts"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Number of automatic restarts for the service.", ms.At(i).Description()) + assert.Equal(t, "{restarts}", ms.At(i).Unit()) + assert.True(t, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) case "systemd.unit.state": assert.False(t, validatedMetrics["systemd.unit.state"], "Found a duplicate in the metrics slice: systemd.unit.state") validatedMetrics["systemd.unit.state"] = true diff --git a/receiver/systemdreceiver/internal/metadata/testdata/config.yaml b/receiver/systemdreceiver/internal/metadata/testdata/config.yaml index c73e542e18ac7..83bd8a92f9ea5 100644 --- a/receiver/systemdreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/systemdreceiver/internal/metadata/testdata/config.yaml @@ -3,6 +3,8 @@ all_set: metrics: systemd.service.cpu.time: enabled: true + systemd.service.restarts: + enabled: true systemd.unit.state: enabled: true resource_attributes: @@ -12,6 +14,8 @@ none_set: metrics: systemd.service.cpu.time: enabled: false + systemd.service.restarts: + enabled: false systemd.unit.state: enabled: false resource_attributes: diff --git a/receiver/systemdreceiver/metadata.yaml b/receiver/systemdreceiver/metadata.yaml index 9ea9f50d0c113..8ef01bd53740a 100644 --- a/receiver/systemdreceiver/metadata.yaml +++ b/receiver/systemdreceiver/metadata.yaml @@ -48,6 +48,21 @@ metrics: unit: us attributes: [cpu.mode] + systemd.service.restarts: + description: Number of automatic restarts for the service. 
+ extended_documentation: | + This exposes services' `NRestarts` property as a metric. This only tracks + automatic service restarts (restarts when the process exits), and does + not include manual restarts (e.g. from `systemctl restart`). + enabled: false + stability: + level: development + sum: + value_type: int + aggregation_temporality: cumulative + monotonic: true + unit: "{restarts}" + systemd.unit.state: description: 1 if the check resulted in active_state matching the current state, otherwise 0. enabled: true diff --git a/receiver/systemdreceiver/scraper.go b/receiver/systemdreceiver/scraper.go index 093057fddf0a3..228afe18af409 100644 --- a/receiver/systemdreceiver/scraper.go +++ b/receiver/systemdreceiver/scraper.go @@ -126,6 +126,22 @@ func (s *systemdScraper) hasCgroupMetrics() bool { return s.cfg.Metrics.SystemdServiceCPUTime.Enabled } +func (s *systemdScraper) scrapeRestartCount(now pcommon.Timestamp, unit *unitTuple) error { + restartVariant, err := s.conn.Object("org.freedesktop.systemd1", unit.Path).GetProperty("org.freedesktop.systemd1.Service.NRestarts") + if err != nil { + return err + } + + var restarts int64 + if err2 := restartVariant.Store(&restarts); err2 != nil { + return err2 + } + + s.mb.RecordSystemdServiceRestartsDataPoint(now, restarts) + + return nil +} + func (s *systemdScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { now := pcommon.NewTimestampFromTime(time.Now()) @@ -152,6 +168,13 @@ func (s *systemdScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { errs.AddPartial(1, err) } } + + if s.cfg.Metrics.SystemdServiceRestarts.Enabled { + err := s.scrapeRestartCount(now, unit) + if err != nil { + errs.AddPartial(1, err) + } + } } resource := s.mb.NewResourceBuilder() diff --git a/receiver/systemdreceiver/scraper_test.go b/receiver/systemdreceiver/scraper_test.go index a3b4e79aed195..be87adf698df5 100644 --- a/receiver/systemdreceiver/scraper_test.go +++ b/receiver/systemdreceiver/scraper_test.go @@ -113,6 
+113,7 @@ func (s *testDbusConnection) Object(dest string, path dbus.ObjectPath) dbus.BusO path: path, properties: map[string]dbus.Variant{ "org.freedesktop.systemd1.Service.ControlGroup": dbus.MakeVariant("/system.slice/nginx.service"), + "org.freedesktop.systemd1.Service.NRestarts": dbus.MakeVariant(3), }, } } @@ -128,6 +129,18 @@ func newTestScraper(conf *Config, units []unitTuple) *systemdScraper { } func TestScraperScrape(t *testing.T) { + nginxService := unitTuple{ + Name: "nginx.service", + Description: "A high performance web server and a reverse proxy server", + LoadState: "loaded", + ActiveState: "active", + SubState: "plugged", + Following: "", + Path: "/org/freedesktop/systemd1/unit/nginx_2eservice", + JobID: uint32(0), + JobType: "", + JobPath: "/", + } testCases := []struct { desc string config func() *Config @@ -143,18 +156,7 @@ func TestScraperScrape(t *testing.T) { return cfg }, units: []unitTuple{ - { - Name: "nginx.service", - Description: "A high performance web server and a reverse proxy server", - LoadState: "loaded", - ActiveState: "active", - SubState: "plugged", - Following: "", - Path: "/org/freedesktop/systemd1/unit/nginx_2eservice", - JobID: uint32(0), - JobType: "", - JobPath: "/", - }, + nginxService, { Name: "rsyslog.service", Description: "Advanced key-value store", @@ -179,18 +181,7 @@ func TestScraperScrape(t *testing.T) { return cfg }, units: []unitTuple{ - { - Name: "nginx.service", - Description: "A high performance web server and a reverse proxy server", - LoadState: "loaded", - ActiveState: "active", - SubState: "plugged", - Following: "", - Path: "/org/freedesktop/systemd1/unit/nginx_2eservice", - JobID: uint32(0), - JobType: "", - JobPath: "/", - }, + nginxService, { Name: "dbus.socket", Description: "D-Bus System Message Bus Socket", @@ -207,6 +198,17 @@ func TestScraperScrape(t *testing.T) { goldenName: "cgroups", expectedErr: nil, }, + { + desc: "Service restarts", + config: func() *Config { + cfg := 
createDefaultDisabledConfig() + cfg.Metrics.SystemdServiceRestarts.Enabled = true + return cfg + }, + units: []unitTuple{nginxService}, + goldenName: "restarts", + expectedErr: nil, + }, } for _, tc := range testCases { diff --git a/receiver/systemdreceiver/testdata/expected_metrics/restarts.yaml b/receiver/systemdreceiver/testdata/expected_metrics/restarts.yaml new file mode 100644 index 0000000000000..855173e14ded2 --- /dev/null +++ b/receiver/systemdreceiver/testdata/expected_metrics/restarts.yaml @@ -0,0 +1,21 @@ +resourceMetrics: + - resource: + attributes: + - key: systemd.unit.name + value: + stringValue: nginx.service + scopeMetrics: + - metrics: + - description: Number of automatic restarts for the service. + name: systemd.service.restarts + sum: + aggregationTemporality: 2 + dataPoints: + - asInt: "3" + startTimeUnixNano: "1000000" + timeUnixNano: "2000000" + isMonotonic: true + unit: '{restarts}' + scope: + name: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/systemdreceiver + version: latest