From 41735100415b26b229e49c3ed80fe280e7a4b10e Mon Sep 17 00:00:00 2001 From: Mike Eves Date: Fri, 1 Apr 2022 17:03:31 +0100 Subject: [PATCH] Add additional collector for SLM stats Signed-off-by: Mike Eves --- collector/slm.go | 315 ++++++++++++++++++++++++++++++++++++++ collector/slm_response.go | 37 +++++ collector/slm_test.go | 69 +++++++++ main.go | 7 + 4 files changed, 428 insertions(+) create mode 100644 collector/slm.go create mode 100644 collector/slm_response.go create mode 100644 collector/slm_test.go diff --git a/collector/slm.go b/collector/slm.go new file mode 100644 index 00000000..08a93744 --- /dev/null +++ b/collector/slm.go @@ -0,0 +1,315 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package collector
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"path"
+
+	"github.com/go-kit/log"
+	"github.com/go-kit/log/level"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// policyMetric describes one per-policy metric and how to derive its value and label values from a PolicyStats entry.
+type policyMetric struct {
+	Type   prometheus.ValueType
+	Desc   *prometheus.Desc
+	Value  func(policyStats PolicyStats) float64
+	Labels func(policyStats PolicyStats) []string
+}
+
+// slmMetric describes one unlabelled metric and how to derive its value from the decoded stats response.
+type slmMetric struct {
+	Type  prometheus.ValueType
+	Desc  *prometheus.Desc
+	Value func(slmStats SLMStatsResponse) float64
+}
+
+var (
+	defaultPolicyLabels      = []string{"policy"}
+	defaultPolicyLabelValues = func(policyStats PolicyStats) []string {
+		return []string{policyStats.Policy}
+	}
+)
+
+// SLM is a Prometheus collector exposing the statistics served by the /_slm/stats endpoint.
+type SLM struct {
+	logger log.Logger
+	client *http.Client
+	url    *url.URL
+
+	up                              prometheus.Gauge
+	totalScrapes, jsonParseFailures prometheus.Counter
+
+	slmMetrics    []*slmMetric
+	policyMetrics []*policyMetric
+}
+
+// NewSLM returns an SLM collector that queries url through client and logs to logger.
+func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM {
+	return &SLM{
+		logger: logger,
+		client: client,
+		url:    url,
+
+		up: prometheus.NewGauge(prometheus.GaugeOpts{
+			Name: prometheus.BuildFQName(namespace, "slm_stats", "up"),
+			Help: "Was the last scrape of the ElasticSearch SLM endpoint successful.",
+		}),
+		totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: prometheus.BuildFQName(namespace, "slm_stats", "total_scrapes"),
+			Help: "Current total ElasticSearch SLM scrapes.",
+		}),
+		jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: prometheus.BuildFQName(namespace, "slm_stats", "json_parse_failures"),
+			Help: "Number of errors while parsing JSON.",
+		}),
+		slmMetrics: []*slmMetric{
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "retention_runs"),
+					"Total retention runs",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.RetentionRuns)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "retention_failed"),
+					"Total failed retention runs",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.RetentionFailed)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out"),
+					"Total timed out retention runs",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.RetentionTimedOut)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_millis"),
+					"Retention run deletion time",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.RetentionDeletionTimeMillis)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken"),
+					"Total snapshots taken",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.TotalSnapshotsTaken)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed"),
+					"Total snapshots failed",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.TotalSnapshotsFailed)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted"),
+					"Total snapshots deleted",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.TotalSnapshotsDeleted)
+				},
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures"),
+					"Total snapshot deletion failures",
+					nil, nil,
+				),
+				Value: func(slmStats SLMStatsResponse) float64 {
+					return float64(slmStats.TotalSnapshotDeletionFailures)
+				},
+			},
+		},
+		policyMetrics: []*policyMetric{
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken"),
+					"Total snapshots taken",
+					defaultPolicyLabels, nil,
+				),
+				Value: func(policyStats PolicyStats) float64 {
+					return float64(policyStats.SnapshotsTaken)
+				},
+				Labels: defaultPolicyLabelValues,
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed"),
+					"Total snapshots failed",
+					defaultPolicyLabels, nil,
+				),
+				Value: func(policyStats PolicyStats) float64 {
+					return float64(policyStats.SnapshotsFailed)
+				},
+				Labels: defaultPolicyLabelValues,
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted"),
+					"Total snapshots deleted",
+					defaultPolicyLabels, nil,
+				),
+				Value: func(policyStats PolicyStats) float64 {
+					return float64(policyStats.SnapshotsDeleted)
+				},
+				Labels: defaultPolicyLabelValues,
+			},
+			{
+				Type: prometheus.GaugeValue,
+				Desc: prometheus.NewDesc(
+					prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures"),
+					"Total snapshot deletion failures",
+					defaultPolicyLabels, nil,
+				),
+				Value: func(policyStats PolicyStats) float64 {
+					return float64(policyStats.SnapshotDeletionFailures)
+				},
+				Labels: defaultPolicyLabelValues,
+			},
+		},
+	}
+}
+
+// Describe sends the descriptor of every metric this collector can emit on ch.
+func (s *SLM) Describe(ch chan<- *prometheus.Desc) {
+	for _, metric := range s.slmMetrics {
+		ch <- metric.Desc
+	}
+
+	for _, metric := range s.policyMetrics {
+		ch <- metric.Desc
+	}
+	ch <- s.up.Desc()
+	ch <- s.totalScrapes.Desc()
+	ch <- s.jsonParseFailures.Desc()
+}
+
+// fetchAndDecodeSLMStats GETs <url>/_slm/stats and unmarshals the JSON body.
+// It returns an error if the request fails, the status is not 200, or the body cannot be
+// read or unmarshalled; the last two cases also increment jsonParseFailures.
+func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {
+	var ssr SLMStatsResponse
+
+	u := *s.url
+	u.Path = path.Join(u.Path, "/_slm/stats")
+	res, err := s.client.Get(u.String())
+	if err != nil {
+		return ssr, fmt.Errorf("failed to get slm stats from %s://%s:%s%s: %w",
+			u.Scheme, u.Hostname(), u.Port(), u.Path, err)
+	}
+
+	defer func() {
+		// Keep the close error local rather than assigning to the function-level err.
+		if err := res.Body.Close(); err != nil {
+			_ = level.Warn(s.logger).Log("msg", "failed to close response body", "err", err)
+		}
+	}()
+
+	if res.StatusCode != http.StatusOK {
+		return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode)
+	}
+
+	bts, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		s.jsonParseFailures.Inc()
+		return ssr, err
+	}
+
+	if err := json.Unmarshal(bts, &ssr); err != nil {
+		s.jsonParseFailures.Inc()
+		return ssr, err
+	}
+
+	return ssr, nil
+}
+
+// Collect fetches the SLM stats and sends the resulting metrics on ch.
+func (s *SLM) Collect(ch chan<- prometheus.Metric) {
+	s.totalScrapes.Inc()
+	// Deferred so up, totalScrapes and jsonParseFailures are sent on the failure path too.
+	defer func() {
+		ch <- s.up
+		ch <- s.totalScrapes
+		ch <- s.jsonParseFailures
+	}()
+
+	slmStatsResp, err := s.fetchAndDecodeSLMStats()
+	if err != nil {
+		s.up.Set(0)
+		_ = level.Warn(s.logger).Log("msg", "failed to fetch and decode slm stats", "err", err)
+		return
+	}
+	s.up.Set(1)
+
+	for _, metric := range s.slmMetrics {
+		ch <- prometheus.MustNewConstMetric(
+			metric.Desc,
+			metric.Type,
+			metric.Value(slmStatsResp),
+		)
+	}
+
+	for _, metric := range s.policyMetrics {
+		for _, policy := range slmStatsResp.PolicyStats {
+			ch <- prometheus.MustNewConstMetric(
+				metric.Desc,
+				metric.Type,
+				metric.Value(policy),
+				metric.Labels(policy)...,
+			)
+		}
+	}
+}
diff --git a/collector/slm_response.go b/collector/slm_response.go
new file mode 100644
index 00000000..fb7ddbc6
--- /dev/null
+++ b/collector/slm_response.go
@@ -0,0 +1,37 @@
+// Copyright 2021 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package collector
+
+// SLMStatsResponse is a representation of the SLM stats; each field is filled from the JSON key named in its tag.
+type SLMStatsResponse struct {
+	RetentionRuns                 int64         `json:"retention_runs"`
+	RetentionFailed               int64         `json:"retention_failed"`
+	RetentionTimedOut             int64         `json:"retention_timed_out"`
+	RetentionDeletionTime         string        `json:"retention_deletion_time"`        // string form of the duration; the test fixture uses "1.2m"
+	RetentionDeletionTimeMillis   int64         `json:"retention_deletion_time_millis"` // presumably the same duration in milliseconds - TODO confirm
+	TotalSnapshotsTaken           int64         `json:"total_snapshots_taken"`
+	TotalSnapshotsFailed          int64         `json:"total_snapshots_failed"`
+	TotalSnapshotsDeleted         int64         `json:"total_snapshots_deleted"`
+	TotalSnapshotDeletionFailures int64         `json:"total_snapshot_deletion_failures"`
+	PolicyStats                   []PolicyStats `json:"policy_stats"` // decoded from the "policy_stats" JSON array
+}
+
+// PolicyStats is a representation of SLM stats for specific policies; Policy supplies the "policy" label value.
+type PolicyStats struct {
+	Policy                   string `json:"policy"`
+	SnapshotsTaken           int64  `json:"snapshots_taken"`
+	SnapshotsFailed          int64  `json:"snapshots_failed"`
+	SnapshotsDeleted         int64  `json:"snapshots_deleted"`
+	SnapshotDeletionFailures int64  `json:"snapshot_deletion_failures"`
+}
diff --git a/collector/slm_test.go b/collector/slm_test.go
new file mode 100644
index 00000000..b22f8787
--- /dev/null
+++ b/collector/slm_test.go
@@ -0,0 +1,69 @@
+// Copyright 2021 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package collector
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"testing"
+
+	"github.com/go-kit/log"
+)
+
+// TestSLM checks that fetchAndDecodeSLMStats decodes a canned /_slm/stats response served by a local test server.
+func TestSLM(t *testing.T) {
+	// Testcases created using:
+	// docker run -d -p 9200:9200 -e discovery.type=single-node -e path.repo=/tmp/backups docker.elastic.co/elasticsearch/elasticsearch:7.15.0-arm64
+	// curl -XPUT http://127.0.0.1:9200/_snapshot/my_repository -H 'Content-Type: application/json' -d '{"type":"url","settings":{"url":"file:/tmp/backups"}}'
+	// curl -XPUT http://127.0.0.1:9200/_slm/policy/everything -H 'Content-Type: application/json' -d '{"schedule":"0 */15 * * * ?","name":"","repository":"my_repository","config":{"indices":".*","include_global_state":true,"ignore_unavailable":true},"retention":{"expire_after":"7d"}}'
+	// curl http://127.0.0.1:9200/_slm/stats (Numbers manually tweaked)
+
+	tcs := map[string][]string{
+		"7.15.0": {`{"retention_runs":9,"retention_failed":0,"retention_timed_out":0,"retention_deletion_time":"1.2m","retention_deletion_time_millis":72491,"total_snapshots_taken":103,"total_snapshots_failed":2,"total_snapshots_deleted":20,"total_snapshot_deletion_failures":0,"policy_stats":[{"policy":"everything","snapshots_taken":50,"snapshots_failed":2,"snapshots_deleted":20,"snapshot_deletion_failures":0}]}`},
+	}
+	for ver, out := range tcs {
+		t.Run(ver, func(t *testing.T) {
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.RequestURI == "/_slm/stats" {
+					fmt.Fprint(w, out[0])
+					return
+				}
+				// Each fixture holds a single payload, so any other path is answered with a 404.
+				http.NotFound(w, r)
+			}))
+			defer ts.Close() // runs when this subtest returns, not at the end of TestSLM
+			u, err := url.Parse(ts.URL)
+			if err != nil {
+				t.Fatalf("Failed to parse URL: %s", err)
+			}
+			s := NewSLM(log.NewNopLogger(), http.DefaultClient, u)
+			stats, err := s.fetchAndDecodeSLMStats()
+			if err != nil {
+				t.Fatalf("Failed to fetch or decode snapshots stats: %s", err)
+			}
+			t.Logf("[%s] SLM Response: %+v", ver, stats)
+			if stats.TotalSnapshotsTaken != 103 {
+				t.Errorf("Bad number of total snapshots taken: got %d, want 103", stats.TotalSnapshotsTaken)
+			}
+			if len(stats.PolicyStats) != 1 {
+				t.Fatalf("Bad number of policies: got %d, want 1", len(stats.PolicyStats))
+			}
+			if got := stats.PolicyStats[0].SnapshotsTaken; got != 50 {
+				t.Errorf("Bad number of policy snapshots taken: got %d, want 50", got)
+			}
+		})
+	}
+}
diff --git a/main.go b/main.go
index cde2e747..2d3657ce 100644
--- a/main.go
+++ b/main.go
@@ -82,6 +82,9 @@ func main() {
 		esExportSnapshots = kingpin.Flag("es.snapshots",
 			"Export stats for the cluster snapshots.").
 			Default("false").Bool()
+		esExportSLM = kingpin.Flag("es.slm",
+			"Export stats for SLM snapshots.").
+			Default("false").Bool()
 		esClusterInfoInterval = kingpin.Flag("es.clusterinfo.interval",
 			"Cluster info update interval for the cluster label").
 			Default("5m").Duration()
@@ -176,6 +179,10 @@ func main() {
 		prometheus.MustRegister(collector.NewSnapshots(logger, httpClient, esURL))
 	}
 
+	if *esExportSLM {
+		prometheus.MustRegister(collector.NewSLM(logger, httpClient, esURL))
+	}
+
 	if *esExportClusterSettings {
 		prometheus.MustRegister(collector.NewClusterSettings(logger, httpClient, esURL))
 	}