Skip to content

Commit

Permalink
Refactor tests for slm collector (#928)
Browse files Browse the repository at this point in the history
- Remove up, totalScrapes, and jsonParseFailures metrics. They are not useful.
- Move fixtures to individual files
- Base tests on the metric output so that they verify the expected output rather than the collector's internals.

Signed-off-by: Joe Adams <[email protected]>
  • Loading branch information
sysadmind authored Sep 13, 2024
1 parent 711a6ce commit d98d2f6
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 59 deletions.
33 changes: 0 additions & 33 deletions collector/slm.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@ type SLM struct {
client *http.Client
url *url.URL

up prometheus.Gauge
totalScrapes, jsonParseFailures prometheus.Counter

slmMetrics []*slmMetric
policyMetrics []*policyMetric
slmStatusMetric *slmStatusMetric
Expand All @@ -75,19 +72,6 @@ func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM {
logger: logger,
client: client,
url: url,

up: prometheus.NewGauge(prometheus.GaugeOpts{
Name: prometheus.BuildFQName(namespace, "slm_stats", "up"),
Help: "Was the last scrape of the Elasticsearch SLM endpoint successful.",
}),
totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
Name: prometheus.BuildFQName(namespace, "slm_stats", "total_scrapes"),
Help: "Current total Elasticsearch SLM scrapes.",
}),
jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
Name: prometheus.BuildFQName(namespace, "slm_stats", "json_parse_failures"),
Help: "Number of errors while parsing JSON.",
}),
slmMetrics: []*slmMetric{
{
Type: prometheus.CounterValue,
Expand Down Expand Up @@ -257,9 +241,6 @@ func (s *SLM) Describe(ch chan<- *prometheus.Desc) {
ch <- metric.Desc
}

ch <- s.up.Desc()
ch <- s.totalScrapes.Desc()
ch <- s.jsonParseFailures.Desc()
}

func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {
Expand Down Expand Up @@ -289,12 +270,10 @@ func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {

bts, err := io.ReadAll(res.Body)
if err != nil {
s.jsonParseFailures.Inc()
return ssr, err
}

if err := json.Unmarshal(bts, &ssr); err != nil {
s.jsonParseFailures.Inc()
return ssr, err
}

Expand Down Expand Up @@ -328,12 +307,10 @@ func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) {

bts, err := io.ReadAll(res.Body)
if err != nil {
s.jsonParseFailures.Inc()
return ssr, err
}

if err := json.Unmarshal(bts, &ssr); err != nil {
s.jsonParseFailures.Inc()
return ssr, err
}

Expand All @@ -342,16 +319,9 @@ func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) {

// Collect gets SLM metric values
func (s *SLM) Collect(ch chan<- prometheus.Metric) {
s.totalScrapes.Inc()
defer func() {
ch <- s.up
ch <- s.totalScrapes
ch <- s.jsonParseFailures
}()

slmStatusResp, err := s.fetchAndDecodeSLMStatus()
if err != nil {
s.up.Set(0)
level.Warn(s.logger).Log(
"msg", "failed to fetch and decode slm status",
"err", err,
Expand All @@ -361,16 +331,13 @@ func (s *SLM) Collect(ch chan<- prometheus.Metric) {

slmStatsResp, err := s.fetchAndDecodeSLMStats()
if err != nil {
s.up.Set(0)
level.Warn(s.logger).Log(
"msg", "failed to fetch and decode slm stats",
"err", err,
)
return
}

s.up.Set(1)

for _, status := range statuses {
ch <- prometheus.MustNewConstMetric(
s.slmStatusMetric.Desc,
Expand Down
125 changes: 99 additions & 26 deletions collector/slm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
package collector

import (
"fmt"
"io"
"net/http"
"net/http/httptest"
"net/url"
"os"
"path"
"strings"
"testing"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestSLM(t *testing.T) {
Expand All @@ -31,35 +35,104 @@ func TestSLM(t *testing.T) {
// curl -XPUT http://127.0.0.1:9200/_slm/policy/everything -H 'Content-Type: application/json' -d '{"schedule":"0 */15 * * * ?","name":"<everything-{now/d}>","repository":"my_repository","config":{"indices":".*","include_global_state":true,"ignore_unavailable":true},"retention":{"expire_after":"7d"}}'
// curl http://127.0.0.1:9200/_slm/stats (Numbers manually tweaked)

tcs := map[string]string{
"7.15.0": `{"retention_runs":9,"retention_failed":0,"retention_timed_out":0,"retention_deletion_time":"1.2m","retention_deletion_time_millis":72491,"total_snapshots_taken":103,"total_snapshots_failed":2,"total_snapshots_deleted":20,"total_snapshot_deletion_failures":0,"policy_stats":[{"policy":"everything","snapshots_taken":50,"snapshots_failed":2,"snapshots_deleted":20,"snapshot_deletion_failures":0}]}`,
tests := []struct {
name string
file string
want string
}{
{
name: "7.15.0",
file: "7.15.0.json",
want: `# HELP elasticsearch_slm_stats_operation_mode Operating status of SLM
# TYPE elasticsearch_slm_stats_operation_mode gauge
elasticsearch_slm_stats_operation_mode{operation_mode="RUNNING"} 0
elasticsearch_slm_stats_operation_mode{operation_mode="STOPPED"} 0
elasticsearch_slm_stats_operation_mode{operation_mode="STOPPING"} 0
# HELP elasticsearch_slm_stats_retention_deletion_time_seconds Retention run deletion time
# TYPE elasticsearch_slm_stats_retention_deletion_time_seconds gauge
elasticsearch_slm_stats_retention_deletion_time_seconds 72.491
# HELP elasticsearch_slm_stats_retention_failed_total Total failed retention runs
# TYPE elasticsearch_slm_stats_retention_failed_total counter
elasticsearch_slm_stats_retention_failed_total 0
# HELP elasticsearch_slm_stats_retention_runs_total Total retention runs
# TYPE elasticsearch_slm_stats_retention_runs_total counter
elasticsearch_slm_stats_retention_runs_total 9
# HELP elasticsearch_slm_stats_retention_timed_out_total Total timed out retention runs
# TYPE elasticsearch_slm_stats_retention_timed_out_total counter
elasticsearch_slm_stats_retention_timed_out_total 0
# HELP elasticsearch_slm_stats_snapshot_deletion_failures_total Total snapshot deletion failures
# TYPE elasticsearch_slm_stats_snapshot_deletion_failures_total counter
elasticsearch_slm_stats_snapshot_deletion_failures_total{policy="everything"} 0
# HELP elasticsearch_slm_stats_snapshots_deleted_total Total snapshots deleted
# TYPE elasticsearch_slm_stats_snapshots_deleted_total counter
elasticsearch_slm_stats_snapshots_deleted_total{policy="everything"} 20
# HELP elasticsearch_slm_stats_snapshots_failed_total Total snapshots failed
# TYPE elasticsearch_slm_stats_snapshots_failed_total counter
elasticsearch_slm_stats_snapshots_failed_total{policy="everything"} 2
# HELP elasticsearch_slm_stats_snapshots_taken_total Total snapshots taken
# TYPE elasticsearch_slm_stats_snapshots_taken_total counter
elasticsearch_slm_stats_snapshots_taken_total{policy="everything"} 50
# HELP elasticsearch_slm_stats_total_snapshot_deletion_failures_total Total snapshot deletion failures
# TYPE elasticsearch_slm_stats_total_snapshot_deletion_failures_total counter
elasticsearch_slm_stats_total_snapshot_deletion_failures_total 0
# HELP elasticsearch_slm_stats_total_snapshots_deleted_total Total snapshots deleted
# TYPE elasticsearch_slm_stats_total_snapshots_deleted_total counter
elasticsearch_slm_stats_total_snapshots_deleted_total 20
# HELP elasticsearch_slm_stats_total_snapshots_failed_total Total snapshots failed
# TYPE elasticsearch_slm_stats_total_snapshots_failed_total counter
elasticsearch_slm_stats_total_snapshots_failed_total 2
# HELP elasticsearch_slm_stats_total_snapshots_taken_total Total snapshots taken
# TYPE elasticsearch_slm_stats_total_snapshots_taken_total counter
elasticsearch_slm_stats_total_snapshots_taken_total 103
`,
},
}
for ver, out := range tcs {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintln(w, out)
}))
defer ts.Close()

u, err := url.Parse(ts.URL)
if err != nil {
t.Fatalf("Failed to parse URL: %s", err)
}
s := NewSLM(log.NewNopLogger(), http.DefaultClient, u)
stats, err := s.fetchAndDecodeSLMStats()
if err != nil {
t.Fatalf("Failed to fetch or decode snapshots stats: %s", err)
}
t.Logf("[%s] SLM Response: %+v", ver, stats)
slmStats := stats
policyStats := stats.PolicyStats[0]
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
fStatsPath := path.Join("../fixtures/slm/stats/", tt.file)
fStats, err := os.Open(fStatsPath)
if err != nil {
t.Fatal(err)
}
defer fStats.Close()

if slmStats.TotalSnapshotsTaken != 103 {
t.Errorf("Bad number of total snapshots taken")
}
fStatusPath := path.Join("../fixtures/slm/status/", tt.file)
fStatus, err := os.Open(fStatusPath)
if err != nil {
t.Fatal(err)
}
defer fStatus.Close()

ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.RequestURI {
case "/_slm/stats":
io.Copy(w, fStats)
return
case "/_slm/status":
io.Copy(w, fStatus)
return
}

http.Error(w, "Not Found", http.StatusNotFound)
}))
defer ts.Close()

u, err := url.Parse(ts.URL)
if err != nil {
t.Fatalf("Failed to parse URL: %s", err)
}

s := NewSLM(log.NewNopLogger(), http.DefaultClient, u)
if err != nil {
t.Fatal(err)
}

if err := testutil.CollectAndCompare(s, strings.NewReader(tt.want)); err != nil {
t.Fatal(err)
}
})

if policyStats.SnapshotsTaken != 50 {
t.Errorf("Bad number of policy snapshots taken")
}
}

}
20 changes: 20 additions & 0 deletions fixtures/slm/stats/7.15.0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"retention_runs": 9,
"retention_failed": 0,
"retention_timed_out": 0,
"retention_deletion_time": "1.2m",
"retention_deletion_time_millis": 72491,
"total_snapshots_taken": 103,
"total_snapshots_failed": 2,
"total_snapshots_deleted": 20,
"total_snapshot_deletion_failures": 0,
"policy_stats": [
{
"policy": "everything",
"snapshots_taken": 50,
"snapshots_failed": 2,
"snapshots_deleted": 20,
"snapshot_deletion_failures": 0
}
]
}
1 change: 1 addition & 0 deletions fixtures/slm/status/7.15.0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}

0 comments on commit d98d2f6

Please sign in to comment.