Cherry-pick to 7.9: [Metricbeat][Kibana] Apply backoff when errored at getting usage stats (#20772) (#21162)

afharo · ycombinator · web-flow · commit 1979d7872f4f · 2020-09-18T19:01:18.000+01:00
Co-authored-by: Shaunak Kashyap <ycombinator@gmail.com>

Co-authored-by: Shaunak Kashyap <ycombinator@gmail.com>
diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
@@ -101,6 +101,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
 - Fix storage metricset to allow config without region/zone. {issue}17623[17623] {pull}17624[17624]
 - Fix overflow on Prometheus rates when new buckets are added on the go. {pull}17753[17753]
 - Add a switch to the driver definition on SQL module to use pretty names {pull}17378[17378]
+- The Kibana collector applies backoff when errored at getting usage stats {pull}20772[20772]
 - The `elasticsearch/index` metricset only requests wildcard expansion for hidden indices if the monitored Elasticsearch cluster supports it. {pull}20938[20938]
 - Fix panic index out of range error when getting AWS account name. {pull}21101[21101] {issue}21095[21095]
 - Handle missing counters in the application_pool metricset. {pull}21071[21071]
diff --git a/metricbeat/module/kibana/stats/stats.go b/metricbeat/module/kibana/stats/stats.go
@@ -38,9 +38,10 @@ func init() {
 }
 
 const (
-	statsPath             = "api/stats"
-	settingsPath          = "api/settings"
-	usageCollectionPeriod = 24 * time.Hour
+	statsPath              = "api/stats"
+	settingsPath           = "api/settings"
+	usageCollectionPeriod  = 24 * time.Hour
+	usageCollectionBackoff = 1 * time.Hour
 )
 
 var (
@@ -57,6 +58,7 @@ type MetricSet struct {
 	statsHTTP            *helper.HTTP
 	settingsHTTP         *helper.HTTP
 	usageLastCollectedOn time.Time
+	usageNextCollectOn   time.Time
 	isUsageExcludable    bool
 }
 
@@ -165,6 +167,10 @@ func (m *MetricSet) fetchStats(r mb.ReporterV2, now time.Time) error {
 
 		content, err = m.statsHTTP.FetchContent()
 		if err != nil {
+			if shouldCollectUsage {
+				// When errored in collecting the usage stats it may be counterproductive to try again on the next poll, try to collect the stats again after usageCollectionBackoff
+				m.usageNextCollectOn = now.Add(usageCollectionBackoff)
+			}
 			return err
 		}
 
@@ -215,5 +221,5 @@ func (m *MetricSet) calculateIntervalMs() int64 {
 }
 
 func (m *MetricSet) shouldCollectUsage(now time.Time) bool {
-	return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod
+	return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod && now.Sub(m.usageNextCollectOn) > 0
 }
diff --git a/metricbeat/module/kibana/stats/stats_test.go b/metricbeat/module/kibana/stats/stats_test.go
@@ -23,6 +23,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/require"
 
@@ -48,12 +49,12 @@ func TestFetchUsage(t *testing.T) {
 				w.WriteHeader(503)
 
 			case 1: // second call
-				// Make sure exclude_usage is still false since first call failed
-				require.Equal(t, "false", excludeUsage)
+				// Make sure exclude_usage is true since first call failed and it should not try again until usageCollectionBackoff time has passed
+				require.Equal(t, "true", excludeUsage)
 				w.WriteHeader(200)
 
 			case 2: // third call
-				// Make sure exclude_usage is now true since second call succeeded
+				// Make sure exclude_usage is still true
 				require.Equal(t, "true", excludeUsage)
 				w.WriteHeader(200)
 			}
@@ -76,3 +77,40 @@ func TestFetchUsage(t *testing.T) {
 	// Third fetch
 	mbtest.ReportingFetchV2Error(f)
 }
+
+func TestShouldCollectUsage(t *testing.T) {
+	now := time.Now()
+
+	cases := map[string]struct {
+		usageLastCollectedOn time.Time
+		usageNextCollectOn   time.Time
+		expectedResult       bool
+	}{
+		"within_usage_collection_period": {
+			usageLastCollectedOn: now.Add(-1 * usageCollectionPeriod),
+			expectedResult:       false,
+		},
+		"after_usage_collection_period_but_before_next_scheduled_collection": {
+			usageLastCollectedOn: now.Add(-2 * usageCollectionPeriod),
+			usageNextCollectOn:   now.Add(3 * time.Hour),
+			expectedResult:       false,
+		},
+		"after_usage_collection_period_and_after_next_scheduled_collection": {
+			usageLastCollectedOn: now.Add(-2 * usageCollectionPeriod),
+			usageNextCollectOn:   now.Add(-1 * time.Hour),
+			expectedResult:       true,
+		},
+	}
+
+	for name, test := range cases {
+		t.Run(name, func(t *testing.T) {
+			m := MetricSet{
+				usageLastCollectedOn: test.usageLastCollectedOn,
+				usageNextCollectOn:   test.usageNextCollectOn,
+			}
+
+			actualResult := m.shouldCollectUsage(now)
+			require.Equal(t, test.expectedResult, actualResult)
+		})
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,10 @@ func init() {`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`const (`
`41`		`- statsPath = "api/stats"`
`42`		`- settingsPath = "api/settings"`
`43`		`- usageCollectionPeriod = 24 * time.Hour`
	`41`	`+ statsPath = "api/stats"`
	`42`	`+ settingsPath = "api/settings"`
	`43`	`+ usageCollectionPeriod = 24 * time.Hour`
	`44`	`+ usageCollectionBackoff = 1 * time.Hour`
`44`	`45`	`)`
`45`	`46`
`46`	`47`	`var (`
`@@ -57,6 +58,7 @@ type MetricSet struct {`
`57`	`58`	`statsHTTP *helper.HTTP`
`58`	`59`	`settingsHTTP *helper.HTTP`
`59`	`60`	`usageLastCollectedOn time.Time`
	`61`	`+ usageNextCollectOn time.Time`
`60`	`62`	`isUsageExcludable bool`
`61`	`63`	`}`
`62`	`64`
`@@ -165,6 +167,10 @@ func (m *MetricSet) fetchStats(r mb.ReporterV2, now time.Time) error {`
`165`	`167`
`166`	`168`	`content, err = m.statsHTTP.FetchContent()`
`167`	`169`	`if err != nil {`
	`170`	`+ if shouldCollectUsage {`
	`171`	`+ // When errored in collecting the usage stats it may be counterproductive to try again on the next poll, try to collect the stats again after usageCollectionBackoff`
	`172`	`+ m.usageNextCollectOn = now.Add(usageCollectionBackoff)`
	`173`	`+ }`
`168`	`174`	`return err`
`169`	`175`	`}`
`170`	`176`
`@@ -215,5 +221,5 @@ func (m *MetricSet) calculateIntervalMs() int64 {`
`215`	`221`	`}`
`216`	`222`
`217`	`223`	`func (m *MetricSet) shouldCollectUsage(now time.Time) bool {`
`218`		`- return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod`
	`224`	`+ return now.Sub(m.usageLastCollectedOn) > usageCollectionPeriod && now.Sub(m.usageNextCollectOn) > 0`
`219`	`225`	`}`