From 7083ae8267ceead89b27621c165230cb0fd239ec Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 10 Jul 2024 10:00:31 +0100 Subject: [PATCH] [release 2.53] Revert 13583 to stop dropping samples in remote-write catch-up (#14446) * Revert "fix bug that would cause us to endlessly fall behind (#13583)" This reverts commit 0c71230784368da829f1f02d412d181d7a06aee6. (leaving the new test in place) * TSDB: enhance TestRun_AvoidNotifyWhenBehind With code suggested by @cstyan in #14439. * WAL watcher: add back log line showing current segment --------- Signed-off-by: Bryan Boreham --- tsdb/wlog/watcher.go | 21 ++++---- tsdb/wlog/watcher_test.go | 110 +++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 8ebd9249aad..d836e3d868d 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -262,6 +262,11 @@ func (w *Watcher) loop() { // Run the watcher, which will tail the WAL until the quit channel is closed // or an error case is hit. func (w *Watcher) Run() error { + _, lastSegment, err := w.firstAndLast() + if err != nil { + return fmt.Errorf("wal.Segments: %w", err) + } + // We want to ensure this is false across iterations since // Run will be called again if there was a failure to read the WAL. w.sendSamples = false @@ -286,20 +291,14 @@ func (w *Watcher) Run() error { return err } - level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment) + level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment, "lastSegment", lastSegment) for !isClosed(w.quit) { w.currentSegmentMetric.Set(float64(currentSegment)) - // Re-check on each iteration in case a new segment was added, - // because watch() will wait for notifications on the last segment. - _, lastSegment, err := w.firstAndLast() - if err != nil { - return fmt.Errorf("wal.Segments: %w", err) - } - tail := currentSegment >= lastSegment - - level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment, "lastSegment", lastSegment) - if err := w.watch(currentSegment, tail); err != nil && !errors.Is(err, ErrIgnorable) { + // On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment. + // On subsequent calls to this function, currentSegment will have been incremented and we should open that segment. 
+ level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment) + if err := w.watch(currentSegment, currentSegment >= lastSegment); err != nil && !errors.Is(err, ErrIgnorable) { return err } diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index ff006cb8174..b39a0fa8abf 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -17,6 +17,7 @@ import ( "math/rand" "os" "path" + "runtime" "sync" "testing" "time" @@ -698,11 +699,46 @@ func TestRun_StartupTime(t *testing.T) { } } +func generateWALRecords(w *WL, segment, seriesCount, samplesCount int) error { + enc := record.Encoder{} + for j := 0; j < seriesCount; j++ { + ref := j + (segment * 100) + series := enc.Series([]record.RefSeries{ + { + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", segment)), + }, + }, nil) + if err := w.Log(series); err != nil { + return err + } + + for k := 0; k < samplesCount; k++ { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(segment), + V: float64(segment), + }, + }, nil) + if err := w.Log(sample); err != nil { + return err + } + } + } + return nil +} + func TestRun_AvoidNotifyWhenBehind(t *testing.T) { - const pageSize = 32 * 1024 - const segments = 10 - const seriesCount = 20 - const samplesCount = 300 + if runtime.GOOS == "windows" { // Takes a really long time, perhaps because min sleep time is 15ms. + t.SkipNow() + } + const segmentSize = pageSize // Smallest allowed segment size. + const segmentsToWrite = 5 + const segmentsToRead = segmentsToWrite - 1 + const seriesCount = 10 + const samplesCount = 50 // This test can take longer than intended to finish in cloud CI. readTimeout := 10 * time.Second @@ -715,73 +751,37 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) + w, err := NewSize(nil, nil, wdir, segmentSize, compress) require.NoError(t, err) var wg sync.WaitGroup - // add one segment initially to ensure there's a value > 0 for the last segment id - for i := 0; i < 1; i++ { - for j := 0; j < seriesCount; j++ { - ref := j + (i * 100) - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for k := 0; k < samplesCount; k++ { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - } - } + // Generate one segment initially to ensure that watcher.Run() finds at least one segment on disk. 
+		require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount))
+		w.NextSegment() // Force creation of the next segment.

 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			for i := 1; i < segments; i++ {
-				for j := 0; j < seriesCount; j++ {
-					ref := j + (i * 100)
-					series := enc.Series([]record.RefSeries{
-						{
-							Ref:    chunks.HeadSeriesRef(ref),
-							Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)),
-						},
-					}, nil)
-					require.NoError(t, w.Log(series))
-
-					for k := 0; k < samplesCount; k++ {
-						inner := rand.Intn(ref + 1)
-						sample := enc.Samples([]record.RefSample{
-							{
-								Ref: chunks.HeadSeriesRef(inner),
-								T:   int64(i),
-								V:   float64(i),
-							},
-						}, nil)
-						require.NoError(t, w.Log(sample))
-					}
-				}
+			for i := 1; i < segmentsToWrite; i++ {
+				require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount))
+				w.NextSegment()
 			}
 		}()

 		wt := newWriteToMock(time.Millisecond)
 		watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false)
-		watcher.MaxSegment = segments
+		watcher.MaxSegment = segmentsToRead
 		watcher.setMetrics()
 		startTime := time.Now()
 		err = watcher.Run()
 		wg.Wait()
 		require.Less(t, time.Since(startTime), readTimeout)
+
+		// But sample records shouldn't get dropped.
+		retry(t, defaultRetryInterval, defaultRetries, func() bool {
+			return wt.checkNumSeries() > 0
+		})
+		require.Equal(t, segmentsToRead*seriesCount*samplesCount, wt.samplesAppended)
+
 		require.NoError(t, err)
 		require.NoError(t, w.Close())
 	})