From 4456c97ffba1384db2e1eda5648210067f792c21 Mon Sep 17 00:00:00 2001 From: Mike Eves Date: Thu, 12 May 2022 14:31:12 +0100 Subject: [PATCH] Add additional collector for SLM stats (#558) * Add additional collector for SLM stats Signed-off-by: Mike Eves * Add additional metric for SLM status (operation mode) Signed-off-by: Mike Eves * Update README Signed-off-by: Mike Eves * Record time metrics in seconds Signed-off-by: Mike Eves * Update metrics to be counters where appropriate Signed-off-by: Mike Eves * Modify tests and update label on operation_mode metric Signed-off-by: Mike Eves * Simplify test fixture Signed-off-by: Mike Eves --- README.md | 19 ++ collector/slm.go | 401 ++++++++++++++++++++++++++++++++++++++ collector/slm_response.go | 42 ++++ collector/slm_test.go | 65 ++++++ main.go | 7 + 5 files changed, 534 insertions(+) create mode 100644 collector/slm.go create mode 100644 collector/slm_response.go create mode 100644 collector/slm_test.go diff --git a/README.md b/README.md index 0dfd5977..a1cf9104 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ elasticsearch_exporter --help | es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | | es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | | es.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. | false | +| es.slm | | If true, query stats for SLM. | false | | es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | | es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | | es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. 
| | @@ -86,6 +87,7 @@ es.indices | `indices` `monitor` (per index or `*`) | All actions that are requi es.indices_settings | `indices` `monitor` (per index or `*`) | es.shards | not sure if `indices` or `cluster` `monitor` or both | es.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) +es.slm | `read_slm` Further Information - [Build in Users](https://www.elastic.co/guide/en/elastic-stack-overview/7.3/built-in-users.html) @@ -221,6 +223,23 @@ Further Information | elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster info retrieval | elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector | elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels +| elasticsearch_slm_stats_up | gauge | 0 | Up metric for SLM collector +| elasticsearch_slm_stats_total_scrapes | counter | 0 | Number of scrapes for SLM collector +| elasticsearch_slm_stats_json_parse_failures | counter | 0 | JSON parse failures for SLM collector +| elasticsearch_slm_stats_retention_runs_total | counter | 0 | Total retention runs +| elasticsearch_slm_stats_retention_failed_total | counter | 0 | Total failed retention runs +| elasticsearch_slm_stats_retention_timed_out_total | counter | 0 | Total retention run timeouts +| elasticsearch_slm_stats_retention_deletion_time_seconds | gauge | 0 | Retention run deletion time +| elasticsearch_slm_stats_total_snapshots_taken_total | counter | 0 | Total snapshots taken +| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed +| elasticsearch_slm_stats_total_snapshots_deleted_total | counter | 0 | Total snapshots deleted +| elasticsearch_slm_stats_total_snapshot_deletion_failures_total | counter | 0 | Total snapshot deletion failures +| elasticsearch_slm_stats_snapshots_taken_total | counter | 1 | 
Snapshots taken by policy +| elasticsearch_slm_stats_snapshots_failed_total | counter | 1 | Snapshots failed by policy +| elasticsearch_slm_stats_snapshots_deleted_total | counter | 1 | Snapshots deleted by policy +| elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 1 | Snapshot deletion failures by policy +| elasticsearch_slm_stats_operation_mode | gauge | 1 | SLM operation mode (Running, stopping, stopped) + ### Alerts & Recording Rules diff --git a/collector/slm.go b/collector/slm.go new file mode 100644 index 00000000..84465517 --- /dev/null +++ b/collector/slm.go @@ -0,0 +1,401 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package collector + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "path" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" +) + +type policyMetric struct { + Type prometheus.ValueType + Desc *prometheus.Desc + Value func(policyStats PolicyStats) float64 + Labels func(policyStats PolicyStats) []string +} + +type slmMetric struct { + Type prometheus.ValueType + Desc *prometheus.Desc + Value func(slmStats SLMStatsResponse) float64 +} + +type slmStatusMetric struct { + Type prometheus.ValueType + Desc *prometheus.Desc + Value func(slmStatus SLMStatusResponse, operationMode string) float64 + Labels func(operationMode string) []string +} + +var ( + defaultPolicyLabels = []string{"policy"} + defaultPolicyLabelValues = func(policyStats PolicyStats) []string { + return []string{policyStats.Policy} + } + + statuses = []string{"RUNNING", "STOPPING", "STOPPED"} +) + +// SLM information struct +type SLM struct { + logger log.Logger + client *http.Client + url *url.URL + + up prometheus.Gauge + totalScrapes, jsonParseFailures prometheus.Counter + + slmMetrics []*slmMetric + policyMetrics []*policyMetric + slmStatusMetric *slmStatusMetric +} + +// NewSLM defines SLM Prometheus metrics +func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM { + return &SLM{ + logger: logger, + client: client, + url: url, + + up: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: prometheus.BuildFQName(namespace, "slm_stats", "up"), + Help: "Was the last scrape of the ElasticSearch SLM endpoint successful.", + }), + totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{ + Name: prometheus.BuildFQName(namespace, "slm_stats", "total_scrapes"), + Help: "Current total ElasticSearch SLM scrapes.", + }), + jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{ + Name: prometheus.BuildFQName(namespace, "slm_stats", "json_parse_failures"), + Help: "Number of errors while parsing 
JSON.", + }), + slmMetrics: []*slmMetric{ + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_runs_total"), + "Total retention runs", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.RetentionRuns) + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_failed_total"), + "Total failed retention runs", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.RetentionFailed) + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out_total"), + "Total timed out retention runs", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.RetentionTimedOut) + }, + }, + { + Type: prometheus.GaugeValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_seconds"), + "Retention run deletion time", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.RetentionDeletionTimeMillis) / 1000 + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken_total"), + "Total snapshots taken", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.TotalSnapshotsTaken) + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed_total"), + "Total snapshots failed", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.TotalSnapshotsFailed) + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted_total"), + 
"Total snapshots deleted", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.TotalSnapshotsDeleted) + }, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures_total"), + "Total snapshot deletion failures", + nil, nil, + ), + Value: func(slmStats SLMStatsResponse) float64 { + return float64(slmStats.TotalSnapshotDeletionFailures) + }, + }, + }, + policyMetrics: []*policyMetric{ + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken_total"), + "Total snapshots taken", + defaultPolicyLabels, nil, + ), + Value: func(policyStats PolicyStats) float64 { + return float64(policyStats.SnapshotsTaken) + }, + Labels: defaultPolicyLabelValues, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed_total"), + "Total snapshots failed", + defaultPolicyLabels, nil, + ), + Value: func(policyStats PolicyStats) float64 { + return float64(policyStats.SnapshotsFailed) + }, + Labels: defaultPolicyLabelValues, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted_total"), + "Total snapshots deleted", + defaultPolicyLabels, nil, + ), + Value: func(policyStats PolicyStats) float64 { + return float64(policyStats.SnapshotsDeleted) + }, + Labels: defaultPolicyLabelValues, + }, + { + Type: prometheus.CounterValue, + Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures_total"), + "Total snapshot deletion failures", + defaultPolicyLabels, nil, + ), + Value: func(policyStats PolicyStats) float64 { + return float64(policyStats.SnapshotDeletionFailures) + }, + Labels: defaultPolicyLabelValues, + }, + }, + slmStatusMetric: &slmStatusMetric{ + Type: prometheus.GaugeValue, + 
Desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "operation_mode"), + "Operating status of SLM", + []string{"operation_mode"}, nil, + ), + Value: func(slmStatus SLMStatusResponse, operationMode string) float64 { + if slmStatus.OperationMode == operationMode { + return 1 + } + return 0 + }, + }, + } +} + +// Describe adds SLM metrics descriptions +func (s *SLM) Describe(ch chan<- *prometheus.Desc) { + ch <- s.slmStatusMetric.Desc + + for _, metric := range s.slmMetrics { + ch <- metric.Desc + } + + for _, metric := range s.policyMetrics { + ch <- metric.Desc + } + + ch <- s.up.Desc() + ch <- s.totalScrapes.Desc() + ch <- s.jsonParseFailures.Desc() +} + +func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) { + var ssr SLMStatsResponse + + u := *s.url + u.Path = path.Join(u.Path, "/_slm/stats") + res, err := s.client.Get(u.String()) + if err != nil { + return ssr, fmt.Errorf("failed to get slm stats health from %s://%s:%s%s: %s", + u.Scheme, u.Hostname(), u.Port(), u.Path, err) + } + + defer func() { + err = res.Body.Close() + if err != nil { + _ = level.Warn(s.logger).Log( + "msg", "failed to close http.Client", + "err", err, + ) + } + }() + + if res.StatusCode != http.StatusOK { + return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode) + } + + bts, err := ioutil.ReadAll(res.Body) + if err != nil { + s.jsonParseFailures.Inc() + return ssr, err + } + + if err := json.Unmarshal(bts, &ssr); err != nil { + s.jsonParseFailures.Inc() + return ssr, err + } + + return ssr, nil +} + +func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) { + var ssr SLMStatusResponse + + u := *s.url + u.Path = path.Join(u.Path, "/_slm/status") + res, err := s.client.Get(u.String()) + if err != nil { + return ssr, fmt.Errorf("failed to get slm status from %s://%s:%s%s: %s", + u.Scheme, u.Hostname(), u.Port(), u.Path, err) + } + + defer func() { + err = res.Body.Close() + if err != nil { + _ = level.Warn(s.logger).Log( + 
"msg", "failed to close http.Client", + "err", err, + ) + } + }() + + if res.StatusCode != http.StatusOK { + return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode) + } + + bts, err := ioutil.ReadAll(res.Body) + if err != nil { + s.jsonParseFailures.Inc() + return ssr, err + } + + if err := json.Unmarshal(bts, &ssr); err != nil { + s.jsonParseFailures.Inc() + return ssr, err + } + + return ssr, nil +} + +// Collect gets SLM metric values +func (s *SLM) Collect(ch chan<- prometheus.Metric) { + s.totalScrapes.Inc() + defer func() { + ch <- s.up + ch <- s.totalScrapes + ch <- s.jsonParseFailures + }() + + slmStatusResp, err := s.fetchAndDecodeSLMStatus() + if err != nil { + s.up.Set(0) + _ = level.Warn(s.logger).Log( + "msg", "failed to fetch and decode slm status", + "err", err, + ) + return + } + + slmStatsResp, err := s.fetchAndDecodeSLMStats() + if err != nil { + s.up.Set(0) + _ = level.Warn(s.logger).Log( + "msg", "failed to fetch and decode slm stats", + "err", err, + ) + return + } + + s.up.Set(1) + + for _, status := range statuses { + ch <- prometheus.MustNewConstMetric( + s.slmStatusMetric.Desc, + s.slmStatusMetric.Type, + s.slmStatusMetric.Value(slmStatusResp, status), + status, + ) + } + + for _, metric := range s.slmMetrics { + ch <- prometheus.MustNewConstMetric( + metric.Desc, + metric.Type, + metric.Value(slmStatsResp), + ) + } + + for _, metric := range s.policyMetrics { + for _, policy := range slmStatsResp.PolicyStats { + ch <- prometheus.MustNewConstMetric( + metric.Desc, + metric.Type, + metric.Value(policy), + metric.Labels(policy)..., + ) + } + } +} diff --git a/collector/slm_response.go b/collector/slm_response.go new file mode 100644 index 00000000..b1cfc1b1 --- /dev/null +++ b/collector/slm_response.go @@ -0,0 +1,42 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collector + +// SLMStatsResponse is a representation of the SLM stats +type SLMStatsResponse struct { + RetentionRuns int64 `json:"retention_runs"` + RetentionFailed int64 `json:"retention_failed"` + RetentionTimedOut int64 `json:"retention_timed_out"` + RetentionDeletionTime string `json:"retention_deletion_time"` + RetentionDeletionTimeMillis int64 `json:"retention_deletion_time_millis"` + TotalSnapshotsTaken int64 `json:"total_snapshots_taken"` + TotalSnapshotsFailed int64 `json:"total_snapshots_failed"` + TotalSnapshotsDeleted int64 `json:"total_snapshots_deleted"` + TotalSnapshotDeletionFailures int64 `json:"total_snapshot_deletion_failures"` + PolicyStats []PolicyStats `json:"policy_stats"` +} + +// PolicyStats is a representation of SLM stats for specific policies +type PolicyStats struct { + Policy string `json:"policy"` + SnapshotsTaken int64 `json:"snapshots_taken"` + SnapshotsFailed int64 `json:"snapshots_failed"` + SnapshotsDeleted int64 `json:"snapshots_deleted"` + SnapshotDeletionFailures int64 `json:"snapshot_deletion_failures"` +} + +// SLMStatusResponse is a representation of the SLM status +type SLMStatusResponse struct { + OperationMode string `json:"operation_mode"` +} diff --git a/collector/slm_test.go b/collector/slm_test.go new file mode 100644 index 00000000..5bbea562 --- /dev/null +++ b/collector/slm_test.go @@ -0,0 +1,65 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collector + +import ( + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "testing" + + "github.com/go-kit/log" +) + +func TestSLM(t *testing.T) { + // Testcases created using: + + // docker run -d -p 9200:9200 -e discovery.type=single-node -e path.repo=/tmp/backups docker.elastic.co/elasticsearch/elasticsearch:7.15.0-arm64 + // curl -XPUT http://127.0.0.1:9200/_snapshot/my_repository -H 'Content-Type: application/json' -d '{"type":"url","settings":{"url":"file:/tmp/backups"}}' + // curl -XPUT http://127.0.0.1:9200/_slm/policy/everything -H 'Content-Type: application/json' -d '{"schedule":"0 */15 * * * ?","name":"","repository":"my_repository","config":{"indices":".*","include_global_state":true,"ignore_unavailable":true},"retention":{"expire_after":"7d"}}' + // curl http://127.0.0.1:9200/_slm/stats (Numbers manually tweaked) + + tcs := map[string]string{ + "7.15.0": `{"retention_runs":9,"retention_failed":0,"retention_timed_out":0,"retention_deletion_time":"1.2m","retention_deletion_time_millis":72491,"total_snapshots_taken":103,"total_snapshots_failed":2,"total_snapshots_deleted":20,"total_snapshot_deletion_failures":0,"policy_stats":[{"policy":"everything","snapshots_taken":50,"snapshots_failed":2,"snapshots_deleted":20,"snapshot_deletion_failures":0}]}`, + } + for ver, out := range tcs { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, out) + })) + defer ts.Close() + + u, err := url.Parse(ts.URL) + if err != nil { + t.Fatalf("Failed to parse URL: %s", err) + } + s := 
NewSLM(log.NewNopLogger(), http.DefaultClient, u) + stats, err := s.fetchAndDecodeSLMStats() + if err != nil { + t.Fatalf("Failed to fetch or decode snapshots stats: %s", err) + } + t.Logf("[%s] SLM Response: %+v", ver, stats) + slmStats := stats + policyStats := stats.PolicyStats[0] + + if slmStats.TotalSnapshotsTaken != 103 { + t.Errorf("Bad number of total snapshots taken") + } + + if policyStats.SnapshotsTaken != 50 { + t.Errorf("Bad number of policy snapshots taken") + } + } + +} diff --git a/main.go b/main.go index cde2e747..2d3657ce 100644 --- a/main.go +++ b/main.go @@ -82,6 +82,9 @@ func main() { esExportSnapshots = kingpin.Flag("es.snapshots", "Export stats for the cluster snapshots."). Default("false").Bool() + esExportSLM = kingpin.Flag("es.slm", + "Export stats for SLM snapshots."). + Default("false").Bool() esClusterInfoInterval = kingpin.Flag("es.clusterinfo.interval", "Cluster info update interval for the cluster label"). Default("5m").Duration() @@ -176,6 +179,10 @@ func main() { prometheus.MustRegister(collector.NewSnapshots(logger, httpClient, esURL)) } + if *esExportSLM { + prometheus.MustRegister(collector.NewSLM(logger, httpClient, esURL)) + } + if *esExportClusterSettings { prometheus.MustRegister(collector.NewClusterSettings(logger, httpClient, esURL)) }