Skip to content

Commit 003b9b2

Browse files
committed
pkg/util/log: Implement handling for oversized log messages in bufferedSink
Previously, when a single log message exceeded the configured max-buffer-size for a buffered sink with exit-on-error enabled, the error would propagate up and trigger process termination. This was overly aggressive for what amounts to a logging configuration issue - a single oversized SQL query (e.g., with a multi-megabyte string literal) could crash an entire CockroachDB node. This commit modifies bufferedSink.output() to detect the errMsgTooLarge error and handle it gracefully. When an oversized message is encountered, instead of propagating the error, we drop the message and log a warning via Ops.Warningf() indicating that the message exceeded the buffer size limit. This allows the node to continue operating normally while still providing visibility into the issue through logged warnings. The implementation uses a two-phase approach to avoid deadlock: first, while holding the sink's mutex, we detect the oversized message and set a flag with the relevant information; then, after releasing the lock, we emit the warning. This is necessary because calling Ops.Warningf() while holding the mutex would cause the warning message to attempt re-entry into the same sink, resulting in a deadlock when it tries to acquire the already-held lock. Part of: CRDB-53951 Epic: CRDB-56325 Release note: None
1 parent fd29a93 commit 003b9b2

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

pkg/util/log/buffered_sink.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,12 @@ func (bs *bufferedSink) output(b []byte, opts sinkOutputOptions) error {
221221

222222
var errC chan error
223223

224+
// Variables to log an oversized message warning after releasing the lock.
225+
// We can't call Ops.Warningf() while holding bs.mu because that would
226+
// cause a deadlock (the warning message would try to log through this same sink).
227+
var logOversizedWarning bool
228+
var maxSize uint64
229+
224230
err := func() error {
225231
bs.mu.Lock()
226232
defer bs.mu.Unlock()
@@ -229,6 +235,17 @@ func (bs *bufferedSink) output(b []byte, opts sinkOutputOptions) error {
229235
if err != nil {
230236
// Release the msg buffer, since our append failed.
231237
putBuffer(msg)
238+
// If the message is too large to fit in the buffer, we log a warning
239+
// but don't return an error. This prevents the server from crashing
240+
// due to exit-on-error when a single log message exceeds max-buffer-size.
241+
// Instead, we gracefully drop the oversized message.
242+
// NOTE: The message content will not be logged.
243+
if errors.Is(err, errMsgTooLarge) {
244+
// Set flag to log warning after releasing the lock (to avoid recursion/deadlock)
245+
logOversizedWarning = true
246+
maxSize = bs.mu.buf.maxSizeBytes
247+
return nil // Don't propagate the error - handle gracefully
248+
}
232249
return err
233250
}
234251

@@ -281,6 +298,14 @@ func (bs *bufferedSink) output(b []byte, opts sinkOutputOptions) error {
281298
}
282299
return nil
283300
}()
301+
302+
// Log the oversized message warning after releasing the lock to avoid deadlock.
303+
// We use Ops.Warningf() which routes the warning through the OPS channel,
304+
// ensuring it appears in log files rather than only on stderr.
305+
if logOversizedWarning {
306+
Ops.Warningf(context.Background(), "dropped log message because it exceeds max-buffer-size (%d bytes) for sink %T", maxSize, bs.child)
307+
}
308+
284309
if err != nil {
285310
return err
286311
}

pkg/util/log/buffered_sink_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,3 +630,40 @@ func (t *testWaitGroupSink) exitCode() exit.Code {
630630
}
631631

632632
var _ logSink = (*testWaitGroupSink)(nil)
633+
634+
// TestBufferedSinkOversizedMessage tests that oversized messages (larger than
635+
// maxBufferSize) are dropped gracefully without returning an error, rather than
636+
// causing the process to crash when exit-on-error is enabled.
637+
func TestBufferedSinkOversizedMessage(t *testing.T) {
638+
defer leaktest.AfterTest(t)()
639+
defer Scope(t).Close(t)
640+
641+
ctrl := gomock.NewController(t)
642+
defer ctrl.Finish()
643+
mock := NewMockLogSink(ctrl)
644+
645+
// Set up a buffered sink with a small max buffer size.
646+
maxBufferSize := uint64(10)
647+
sink := newBufferedSink(mock, noMaxStaleness, noSizeTrigger, maxBufferSize, false /* crashOnAsyncFlushErr */, nil)
648+
closer := newBufferedSinkCloser()
649+
sink.Start(closer)
650+
defer func() { require.NoError(t, closer.Close(defaultCloserTimeout)) }()
651+
652+
// Create a message that exceeds the max buffer size.
653+
oversizedMessage := bytes.Repeat([]byte("x"), int(maxBufferSize)+10)
654+
655+
// The oversized message should NOT be sent to the child sink because it's
656+
// dropped before reaching the child. The bufferedSink detects the message
657+
// is too large in appendMsg() and returns early without ever calling the
658+
// child sink's output() method.
659+
mock.EXPECT().
660+
output(gomock.Any(), gomock.Any()).
661+
Times(0) // No calls should reach the child sink
662+
663+
// Attempt to output the oversized message. This should succeed (return nil)
664+
// and not cause the process to crash. The key behavior being tested is that
665+
// errMsgTooLarge is caught and handled gracefully, returning nil instead of
666+
// propagating the error.
667+
err := sink.output(oversizedMessage, sinkOutputOptions{})
668+
require.NoError(t, err, "oversized message should be handled gracefully without returning an error")
669+
}

0 commit comments

Comments
 (0)