From 639204da104dbc98ebe950af37586dea8ead1a51 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:38:04 +0100 Subject: [PATCH 01/24] loki.write: implement sharding Add capability to perform sharding with loki.write --- .../reference/components/loki/loki.write.md | 16 +- .../component/common/loki/client/config.go | 3 + .../common/loki/client/consumer_fanout.go | 286 +--------- .../common/loki/client/consumer_wal.go | 494 ++--------------- .../component/common/loki/client/shards.go | 498 ++++++++++++++++++ .../common/loki/client/shards_test.go | 176 +++++++ internal/component/loki/write/types.go | 6 +- 7 files changed, 760 insertions(+), 719 deletions(-) create mode 100644 internal/component/common/loki/client/shards.go create mode 100644 internal/component/common/loki/client/shards_test.go diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index 0ac70692258..19cd96a6412 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -46,7 +46,7 @@ You can use the following blocks with `loki.write`: | `endpoint` > [`basic_auth`][basic_auth] | Configure `basic_auth` for authenticating to the endpoint. | no | | `endpoint` > [`oauth2`][oauth2] | Configure OAuth 2.0 for authenticating to the endpoint. | no | | `endpoint` > `oauth2` > [`tls_config`][tls_config] | Configure TLS settings for connecting to the endpoint. | no | -| `endpoint` > [`queue_config`][queue_config] | When WAL is enabled, configures the queue client. | no | +| `endpoint` > [`queue_config`][queue_config] | Configure the queue used for endpoint. | no | | `endpoint` > [`tls_config`][tls_config] | Configure TLS settings for connecting to the endpoint. | no | | [`wal`][wal] | Write-ahead log configuration. 
| no | @@ -104,8 +104,9 @@ The following arguments are supported: If no `tenant_id` is provided, the component assumes that the Loki instance at `endpoint` is running in single-tenant mode and no X-Scope-OrgID header is sent. When multiple `endpoint` blocks are provided, the `loki.write` component creates a client for each. -Received log entries are fanned-out to these clients in succession. -That means that if one client is bottlenecked, it may impact the rest. +Received log entries are fanned-out to these endpoints in succession. That means that if one endpoint is bottlenecked, it may impact the rest. + +Each endpoint has a _queue_ of batches to be sent. The `queue_config` block can be used to customize the behavior of this queue. Endpoints can be named for easier identification in debug metrics by using the `name` argument. If the `name` argument isn't provided, a name is generated based on a hash of the endpoint settings. @@ -127,10 +128,7 @@ When `retry_on_http_429` is enabled, the retry mechanism is governed by the back ### `queue_config` -{{< docs/shared lookup="stability/experimental_feature.md" source="alloy" version="" >}} - -The optional `queue_config` block configures, when WAL is enabled, how the underlying client queues batches of logs sent to Loki. -Refer to [Write-Ahead block](#wal) for more information. +The optional `queue_config` block configures how the endpoint queues batches of logs sent to Loki. The following arguments are supported: @@ -138,6 +136,10 @@ The following arguments are supported: | --------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -------- | | `capacity` | `string` | Controls the size of the underlying send queue buffer. This setting should be considered a worst-case scenario of memory consumption, in which all enqueued batches are full. 
| `10MiB` | no | | `drain_timeout` | `duration` | Configures the maximum time the client can take to drain the send queue upon shutdown. During that time, it enqueues pending batches and drains the send queue sending each. | `"1m"` | no | +| `min_shards` | `number` | Minimum number of concurrent shards sending batches to the endpoint. | `1` | no | + +Each endpoint manages a number of concurrent _shards_, each of which is responsible for sending a fraction of the batches. The number of shards is controlled with the `min_shards` argument. +Each shard has a queue of batches it keeps in memory, controlled with the `capacity` argument. ### `tls_config` diff --git a/internal/component/common/loki/client/config.go b/internal/component/common/loki/client/config.go index 270e47c0139..f999bba99b5 100644 --- a/internal/component/common/loki/client/config.go +++ b/internal/component/common/loki/client/config.go @@ -47,6 +47,9 @@ type QueueConfig struct { // is the 1 MiB default, and a capacity of 100 MiB, the underlying buffered channel would buffer up to 100 batches. Capacity int + // MinShards is the minimum amount of concurrent shards sending batches to the endpoint. + MinShards int + // DrainTimeout controls the maximum time that draining the send queue can take. 
DrainTimeout time.Duration } diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index 6a7fadf3804..a8017d62d9c 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -1,25 +1,17 @@ package client import ( - "bufio" - "bytes" "context" "crypto/sha256" - "errors" "fmt" - "io" - "net/http" - "strconv" "sync" "time" "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/config" - "github.com/prometheus/common/model" "github.com/grafana/alloy/internal/component/common/loki" - "github.com/grafana/alloy/internal/runtime/logging/level" + "github.com/grafana/alloy/internal/component/common/loki/client/internal" "github.com/grafana/alloy/internal/useragent" "github.com/grafana/dskit/backoff" ) @@ -106,7 +98,7 @@ func getClientName(cfg Config) string { return asSha256(cfg) } -func asSha256(o interface{}) string { +func asSha256(o any) string { h := sha256.New() _, _ = fmt.Fprintf(h, "%v", o) @@ -114,151 +106,59 @@ func asSha256(o interface{}) string { return temp[:6] } -const ( - contentType = "application/x-protobuf" - maxErrMsgLen = 1024 - - // Label reserved to override the tenant ID while processing - // pipeline stages - ReservedLabelTenantID = "__tenant_id__" -) - var userAgent = useragent.Get() // Client for pushing logs in snappy-compressed protos over HTTP. type client struct { - metrics *Metrics - logger log.Logger cfg Config - client *http.Client entries chan loki.Entry - once sync.Once - wg sync.WaitGroup + wg sync.WaitGroup - // ctx is used in any upstream calls from the `client`. 
ctx context.Context cancel context.CancelFunc + + shards *shards } func newClient(metrics *Metrics, cfg Config, logger log.Logger) (*client, error) { - if cfg.URL.URL == nil { - return nil, errors.New("client needs target URL") - } - if metrics == nil { - return nil, errors.New("metrics must be instantiated") + logger = log.With(logger, "component", "client", "host", cfg.URL.Host) + + shards, err := newShards(metrics, logger, internal.NewNopMarkerHandler(), cfg) + if err != nil { + return nil, err } ctx, cancel := context.WithCancel(context.Background()) c := &client{ - logger: log.With(logger, "component", "client", "host", cfg.URL.Host), cfg: cfg, entries: make(chan loki.Entry), - metrics: metrics, + shards: shards, ctx: ctx, cancel: cancel, } - err := cfg.Client.Validate() - if err != nil { - return nil, err - } - - c.client, err = config.NewClientFromConfig(cfg.Client, useragent.ProductName, config.WithHTTP2Disabled()) - if err != nil { - return nil, err - } - - c.client.Timeout = cfg.Timeout + c.shards.start(cfg.Queue.MinShards) c.wg.Go(func() { c.run() }) return c, nil } -func (c *client) initBatchMetrics(tenantID string) { - // Initialize counters to 0 so the metrics are exported before the first - // occurrence of incrementing to avoid missing metrics. - for _, counter := range c.metrics.countersWithHostTenantReason { - for _, reason := range Reasons { - counter.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Add(0) - } - } - - for _, counter := range c.metrics.countersWithHostTenant { - counter.WithLabelValues(c.cfg.URL.Host, tenantID).Add(0) - } -} - func (c *client) run() { - batches := map[string]*batch{} - - // Given the client handles multiple batches (1 per tenant) and each batch - // can be created at a different point in time, we look for batches whose - // max wait time has been reached every 10 times per BatchWait, so that the - // maximum delay we have sending batches is 10% of the max waiting time. 
- // We apply a cap of 10ms to the ticker, to avoid too frequent checks in - // case the BatchWait is very low. - minWaitCheckFrequency := 10 * time.Millisecond - maxWaitCheckFrequency := max(c.cfg.BatchWait/10, minWaitCheckFrequency) - - maxWaitCheck := time.NewTicker(maxWaitCheckFrequency) - - defer func() { - maxWaitCheck.Stop() - // Send all pending batches - for tenantID, batch := range batches { - c.sendBatch(tenantID, batch) - } - }() - for { select { - case e, ok := <-c.entries: - if !ok { - return - } - - e, tenantID := c.processEntry(e) - batch, ok := batches[tenantID] - - // If the batch doesn't exist yet, we create a new one with the entry - if !ok { - batches[tenantID] = newBatch(c.cfg.MaxStreams, e) - c.initBatchMetrics(tenantID) - break - } - - // If adding the entry to the batch will increase the size over the max - // size allowed, we do send the current batch and then create a new one - if batch.sizeBytesAfter(e.Entry) > c.cfg.BatchSize { - c.sendBatch(tenantID, batch) - - batches[tenantID] = newBatch(c.cfg.MaxStreams, e) - break - } - - // The max size of the batch isn't reached, so we can add the entry - err := batch.add(e, 0) - if err != nil { - level.Error(c.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err) - reason := ReasonGeneric - if errors.Is(err, errMaxStreamsLimitExceeded) { - reason = ReasonStreamLimited + case <-c.ctx.Done(): + return + case e := <-c.entries: + backoff := backoff.New(c.ctx, backoff.Config{ + MinBackoff: 5 * time.Millisecond, + MaxBackoff: 50 * time.Millisecond, + }) + for !c.shards.enqueue(e, 0) { + if !backoff.Ongoing() { + break } - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Add(float64(len(e.Line))) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Inc() - return - } - case <-maxWaitCheck.C: - // Send all batches whose max wait time has been reached - for tenantID, batch := range batches { - if batch.age() < c.cfg.BatchWait { - continue - 
} - - c.sendBatch(tenantID, batch) - delete(batches, tenantID) } } } @@ -268,146 +168,8 @@ func (c *client) Chan() chan<- loki.Entry { return c.entries } -func batchIsRateLimited(status int) bool { - return status == 429 -} - -func (c *client) sendBatch(tenantID string, batch *batch) { - buf, entriesCount, err := batch.encode() - if err != nil { - level.Error(c.logger).Log("msg", "error encoding batch", "error", err) - return - } - bufBytes := float64(len(buf)) - c.metrics.encodedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes) - - backoff := backoff.New(c.ctx, c.cfg.BackoffConfig) - var status int - for { - start := time.Now() - // send uses `timeout` internally, so `context.Background` is good enough. - status, err = c.send(context.Background(), tenantID, buf) - - c.metrics.requestDuration.WithLabelValues(strconv.Itoa(status), c.cfg.URL.Host, tenantID).Observe(time.Since(start).Seconds()) - - // Immediately drop rate limited batches to avoid HOL blocking for other tenants not experiencing throttling - if c.cfg.DropRateLimitedBatches && batchIsRateLimited(status) { - level.Warn(c.logger).Log("msg", "dropping batch due to rate limiting applied at ingester") - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, ReasonRateLimited).Add(bufBytes) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, ReasonRateLimited).Add(float64(entriesCount)) - return - } - - if err == nil { - c.metrics.sentBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes) - c.metrics.sentEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(entriesCount)) - return - } - - // Only retry 429s, 500s and connection-level errors. 
- if status > 0 && !batchIsRateLimited(status) && status/100 != 5 { - break - } - - level.Debug(c.logger).Log("msg", "error sending batch, will retry", "status", status, "tenant", tenantID, "error", err) - c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc() - backoff.Wait() - - // Make sure it sends at least once before checking for retry. - if !backoff.Ongoing() { - break - } - } - - level.Error(c.logger).Log("msg", "final error sending batch, no retries left, dropping data", "status", status, "tenant", tenantID, "error", err) - // If the reason for the last retry error was rate limiting, count the drops as such, even if the previous errors - // were for a different reason - dropReason := ReasonGeneric - if batchIsRateLimited(status) { - dropReason = ReasonRateLimited - } - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, dropReason).Add(bufBytes) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, dropReason).Add(float64(entriesCount)) -} - -func (c *client) send(ctx context.Context, tenantID string, buf []byte) (int, error) { - ctx, cancel := context.WithTimeout(ctx, c.cfg.Timeout) - defer cancel() - req, err := http.NewRequestWithContext(ctx, "POST", c.cfg.URL.String(), bytes.NewReader(buf)) - if err != nil { - return -1, err - } - req.Header.Set("Content-Type", contentType) - req.Header.Set("User-Agent", userAgent) - - // If the tenant ID is not empty promtail is running in multi-tenant mode, so - // we should send it to Loki - if tenantID != "" { - req.Header.Set("X-Scope-OrgID", tenantID) - } - - // Add custom headers on request - if len(c.cfg.Headers) > 0 { - for k, v := range c.cfg.Headers { - if req.Header.Get(k) == "" { - req.Header.Add(k, v) - } else { - level.Warn(c.logger).Log("msg", "custom header key already exists, skipping", "key", k) - } - } - } - - resp, err := c.client.Do(req) - if err != nil { - return -1, err - } - - // NOTE: it is important in go to fully read the body and - // close it so 
that the connection can be reused. - // We only partially read the body if we encounter a non 2xx error - // so we should always consume whats left. - // https://github.com/golang/go/blob/32a9804c7ba3f4a0e0bd26cc24b9204860a49ec8/src/net/http/response.go#L59-L64 - // It is unclear that we always need to drain the body but - // https://github.com/golang/go/issues/60240#issuecomment-1551060433 seems to indicate that we should. - defer func() { - _, _ = io.Copy(io.Discard, resp.Body) - _ = resp.Body.Close() - }() - - if resp.StatusCode/100 != 2 { - scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen)) - line := "" - if scanner.Scan() { - line = scanner.Text() - } - err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, resp.StatusCode, line) - } - return resp.StatusCode, err -} - -func (c *client) getTenantID(labels model.LabelSet) string { - // Check if it has been overridden while processing the pipeline stages - if value, ok := labels[ReservedLabelTenantID]; ok { - return string(value) - } - - // Check if has been specified in the config - if c.cfg.TenantID != "" { - return c.cfg.TenantID - } - - // Defaults to an empty string, which means the X-Scope-OrgID header - // will not be sent - return "" -} - -// Stop the client. 
func (c *client) Stop() { - c.once.Do(func() { close(c.entries) }) + c.shards.stop() + c.cancel() c.wg.Wait() } - -func (c *client) processEntry(e loki.Entry) (loki.Entry, string) { - tenantID := c.getTenantID(e.Labels) - return e, tenantID -} diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index ca2a6f4cf19..63163a700b9 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -1,23 +1,15 @@ package client import ( - "bufio" - "bytes" "context" - "errors" "fmt" - "io" - "net/http" - "strconv" "sync" "time" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/backoff" - "github.com/grafana/loki/pkg/push" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/tsdb/record" @@ -25,7 +17,6 @@ import ( "github.com/grafana/alloy/internal/component/common/loki" "github.com/grafana/alloy/internal/component/common/loki/client/internal" "github.com/grafana/alloy/internal/component/common/loki/wal" - "github.com/grafana/alloy/internal/useragent" ) func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Config, clientCfgs ...Config) (*WALConsumer, error) { @@ -153,93 +144,54 @@ func (m *WALConsumer) stop(drain bool) { stopWG.Wait() } -// walClient is a WAL-specific remote write client implementation. This client attests to the wal.WriteTo interface, -// which allows it to be injected in the wal.Watcher as a destination where to write read series and entries. As the watcher -// reads from the WAL, batches are created and dispatched onto a send queue when ready to be sent. 
-type walClient struct { - metrics *Metrics - wcMetrics *WALClientMetrics - logger log.Logger - cfg Config - client *http.Client - - batches map[string]*batch - batchesMtx sync.Mutex - sendQueue *queue - drainTimeout time.Duration +func newWalClient(metrics *Metrics, wcMetrics *WALClientMetrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*walClient, error) { + logger = log.With(logger, "component", "client", "host", cfg.URL.Host) - wg sync.WaitGroup - - // series cache - series map[chunks.HeadSeriesRef]model.LabelSet - seriesSegment map[chunks.HeadSeriesRef]int - seriesLock sync.RWMutex - - // ctx is used in any upstream calls from the `client`. - ctx context.Context - cancel context.CancelFunc - quit chan struct{} - markerHandler internal.MarkerHandler -} - -func newWalClient(metrics *Metrics, qcMetrics *WALClientMetrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*walClient, error) { - if cfg.URL.URL == nil { - return nil, errors.New("client needs target URL") + shards, err := newShards(metrics, logger, markerHandler, cfg) + if err != nil { + return nil, err } ctx, cancel := context.WithCancel(context.Background()) c := &walClient{ - logger: log.With(logger, "component", "client", "host", cfg.URL.Host), - cfg: cfg, - metrics: metrics, - wcMetrics: qcMetrics, - drainTimeout: cfg.Queue.DrainTimeout, - quit: make(chan struct{}), - - batches: make(map[string]*batch), - markerHandler: markerHandler, - - series: make(map[chunks.HeadSeriesRef]model.LabelSet), - seriesSegment: make(map[chunks.HeadSeriesRef]int), + logger: logger, + cfg: cfg, + wcMetrics: wcMetrics, + shards: shards, ctx: ctx, cancel: cancel, - } - // The buffered channel size is calculated using the configured capacity, which is the worst case number of bytes - // the send queue can consume. 
- var queueBufferSize = cfg.Queue.Capacity / cfg.BatchSize - c.sendQueue = newQueue(c, queueBufferSize, logger) - - err := cfg.Client.Validate() - if err != nil { - return nil, err - } + series: make(map[chunks.HeadSeriesRef]model.LabelSet), + seriesSegment: make(map[chunks.HeadSeriesRef]int), - c.client, err = config.NewClientFromConfig(cfg.Client, useragent.ProductName) - if err != nil { - return nil, err + markerHandler: markerHandler, } - c.client.Timeout = cfg.Timeout + c.shards.start(cfg.Queue.MinShards) - c.wg.Go(func() { c.runSendOldBatches() }) return c, nil } -func (c *walClient) initBatchMetrics(tenantID string) { - // Initialize counters to 0 so the metrics are exported before the first - // occurrence of incrementing to avoid missing metrics. - for _, counter := range c.metrics.countersWithHostTenantReason { - for _, reason := range Reasons { - counter.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Add(0) - } - } +// walClient is a WAL-specific remote write client implementation. This client attests to the wal.WriteTo interface, +// which allows it to be injected in the wal.Watcher as a destination where to write read series and entries. As the watcher +// reads from the WAL, batches are created and dispatched onto a send queue when ready to be sent. 
+type walClient struct { + wcMetrics *WALClientMetrics + logger log.Logger + cfg Config + shards *shards - for _, counter := range c.metrics.countersWithHostTenant { - counter.WithLabelValues(c.cfg.URL.Host, tenantID).Add(0) - } + ctx context.Context + cancel context.CancelFunc + + // series cache + series map[chunks.HeadSeriesRef]model.LabelSet + seriesSegment map[chunks.HeadSeriesRef]int + seriesLock sync.RWMutex + + markerHandler internal.MarkerHandler } func (c *walClient) SeriesReset(segmentNum int) { @@ -270,7 +222,11 @@ func (c *walClient) AppendEntries(entries wal.RefEntries, segment int) error { var maxSeenTimestamp int64 = -1 if ok { for _, e := range entries.Entries { - c.appendSingleEntry(segment, l, e) + ok := c.appendSingleEntry(loki.Entry{Labels: l, Entry: e}, segment) + if !ok { + return nil + } + if e.Timestamp.Unix() > maxSeenTimestamp { maxSeenTimestamp = e.Timestamp.Unix() } @@ -289,382 +245,24 @@ func (c *walClient) AppendEntries(entries wal.RefEntries, segment int) error { return nil } -func (c *walClient) appendSingleEntry(segmentNum int, lbs model.LabelSet, e push.Entry) { - lbs, tenantID := c.processLabels(lbs) - - // TODO: can I make this locking more fine grained? - c.batchesMtx.Lock() - - batch, ok := c.batches[tenantID] - - // If the batch doesn't exist yet, we create a new one with the entry - if !ok { - nb := newBatch(c.cfg.MaxStreams) - // since the batch is new, adding a new entry, and hence a new stream, won't fail since there aren't any stream - // registered in the batch. 
- _ = nb.add(loki.Entry{Labels: lbs, Entry: e}, segmentNum) - - c.batches[tenantID] = nb - c.batchesMtx.Unlock() - - c.initBatchMetrics(tenantID) - return - } - - // If adding the entry to the batch will increase the size over the max - // size allowed, we do send the current batch and then create a new one - if batch.sizeBytesAfter(e) > c.cfg.BatchSize { - c.sendQueue.enqueue(queuedBatch{ - TenantID: tenantID, - Batch: batch, - }) - - nb := newBatch(c.cfg.MaxStreams) - _ = nb.add(loki.Entry{Labels: lbs, Entry: e}, segmentNum) - c.batches[tenantID] = nb - c.batchesMtx.Unlock() - - return - } - - // The max size of the batch isn't reached, so we can add the entry - err := batch.add(loki.Entry{Labels: lbs, Entry: e}, segmentNum) - c.batchesMtx.Unlock() - - if err != nil { - level.Error(c.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err) - reason := ReasonGeneric - if errors.Is(err, errMaxStreamsLimitExceeded) { - reason = ReasonStreamLimited - } - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Add(float64(len(e.Line))) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, reason).Inc() - } -} - -func (c *walClient) runSendOldBatches() { - // Given the client handles multiple batches (1 per tenant) and each batch - // can be created at a different point in time, we look for batches whose - // max wait time has been reached every 10 times per BatchWait, so that the - // maximum delay we have sending batches is 10% of the max waiting time. - // We apply a cap of 10ms to the ticker, to avoid too frequent checks in - // case the BatchWait is very low. 
- minWaitCheckFrequency := 10 * time.Millisecond - maxWaitCheckFrequency := max(c.cfg.BatchWait/10, minWaitCheckFrequency) - - maxWaitCheck := time.NewTicker(maxWaitCheckFrequency) - - // pablo: maybe this should be moved out - defer func() { - maxWaitCheck.Stop() - }() - - var batchesToFlush []queuedBatch - - for { - select { - case <-c.quit: - return - - case <-maxWaitCheck.C: - c.batchesMtx.Lock() - // Send all batches whose max wait time has been reached - for tenantID, b := range c.batches { - if b.age() < c.cfg.BatchWait { - continue - } - - // add to batches to flush, so we can enqueue them later and release the batches lock - // as early as possible - batchesToFlush = append(batchesToFlush, queuedBatch{ - TenantID: tenantID, - Batch: b, - }) - - // deleting assuming that since the batch expired the wait time, it - // hasn't been written for some time - delete(c.batches, tenantID) - } - - c.batchesMtx.Unlock() - - // enqueue batches that were marked as too old - for _, qb := range batchesToFlush { - c.sendQueue.enqueue(qb) - } - - batchesToFlush = batchesToFlush[:0] // renew slice - } - } -} - -// enqueuePendingBatches will go over the pending batches, and enqueue them in the send queue. If the context's -// deadline is exceeded in any enqueue operation, this routine exits. 
-func (c *walClient) enqueuePendingBatches(ctx context.Context) { - c.batchesMtx.Lock() - defer c.batchesMtx.Unlock() - - for tenantID, batch := range c.batches { - if !c.sendQueue.enqueueWithCancel(ctx, queuedBatch{ - TenantID: tenantID, - Batch: batch, - }) { - // if enqueue times out due to the context timing out, cancel all - return - } - } -} - -func (c *walClient) sendBatch(ctx context.Context, tenantID string, batch *batch) { - buf, entriesCount, err := batch.encode() - if err != nil { - level.Error(c.logger).Log("msg", "error encoding batch", "error", err) - return - } - bufBytes := float64(len(buf)) - c.metrics.encodedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes) - - backoff := backoff.New(c.ctx, c.cfg.BackoffConfig) - var status int - for { - start := time.Now() - // send uses `timeout` internally, so `context.Background` is good enough. - status, err = c.send(ctx, tenantID, buf) - - c.metrics.requestDuration.WithLabelValues(strconv.Itoa(status), c.cfg.URL.Host, tenantID).Observe(time.Since(start).Seconds()) - - // Immediately drop rate limited batches to avoid HOL blocking for other tenants not experiencing throttling - if c.cfg.DropRateLimitedBatches && batchIsRateLimited(status) { - level.Warn(c.logger).Log("msg", "dropping batch due to rate limiting applied at ingester") - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, ReasonRateLimited).Add(bufBytes) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, ReasonRateLimited).Add(float64(entriesCount)) - return - } - - if err == nil { - c.metrics.sentBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes) - c.metrics.sentEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(entriesCount)) - return - } - - // Only retry 429s, 500s and connection-level errors. 
- if status > 0 && !batchIsRateLimited(status) && status/100 != 5 { - break - } - - level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "tenant", tenantID, "error", err) - c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc() - backoff.Wait() - - // Make sure it sends at least once before checking for retry. +func (c *walClient) appendSingleEntry(entry loki.Entry, segmentNum int) bool { + backoff := backoff.New(c.ctx, backoff.Config{ + MinBackoff: 5 * time.Millisecond, + MaxBackoff: 50 * time.Millisecond, + }) + for !c.shards.enqueue(entry, segmentNum) { if !backoff.Ongoing() { - break + // we could not enqueue and client is stopped. + return false } } - - level.Error(c.logger).Log("msg", "final error sending batch, no retries left, dropping data", "status", status, "tenant", tenantID, "error", err) - // If the reason for the last retry error was rate limiting, count the drops as such, even if the previous errors - // were for a different reason - dropReason := ReasonGeneric - if batchIsRateLimited(status) { - dropReason = ReasonRateLimited - } - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID, dropReason).Add(bufBytes) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID, dropReason).Add(float64(entriesCount)) -} - -func (c *walClient) send(ctx context.Context, tenantID string, buf []byte) (int, error) { - ctx, cancel := context.WithTimeout(ctx, c.cfg.Timeout) - defer cancel() - req, err := http.NewRequest("POST", c.cfg.URL.String(), bytes.NewReader(buf)) - if err != nil { - return -1, err - } - req = req.WithContext(ctx) - req.Header.Set("Content-Type", contentType) - req.Header.Set("User-Agent", userAgent) - - // If the tenant ID is not empty promtail is running in multi-tenant mode, so - // we should send it to Loki - if tenantID != "" { - req.Header.Set("X-Scope-OrgID", tenantID) - } - - // Add custom headers on request - if len(c.cfg.Headers) > 0 { - for k, v := range 
c.cfg.Headers { - if req.Header.Get(k) == "" { - req.Header.Add(k, v) - } else { - level.Warn(c.logger).Log("msg", "custom header key already exists, skipping", "key", k) - } - } - } - - resp, err := c.client.Do(req) - if err != nil { - return -1, err - } - - // NOTE: it is important in go to fully read the body and - // close it so that the connection can be reused. - // We only partially read the body if we encounter a non 2xx error - // so we should always consume whats left. - // https://github.com/golang/go/blob/32a9804c7ba3f4a0e0bd26cc24b9204860a49ec8/src/net/http/response.go#L59-L64 - // It is unclear that we always need to drain the body but - // https://github.com/golang/go/issues/60240#issuecomment-1551060433 seems to indicate that we should. - - defer func() { - _, _ = io.Copy(io.Discard, resp.Body) - _ = resp.Body.Close() - }() - - if resp.StatusCode/100 != 2 { - scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen)) - line := "" - if scanner.Scan() { - line = scanner.Text() - } - err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, resp.StatusCode, line) - } - return resp.StatusCode, err -} - -func (c *walClient) getTenantID(labels model.LabelSet) string { - // Check if it has been overridden while processing the pipeline stages - if value, ok := labels[ReservedLabelTenantID]; ok { - return string(value) - } - - // Check if has been specified in the config - if c.cfg.TenantID != "" { - return c.cfg.TenantID - } - - // Defaults to an empty string, which means the X-Scope-OrgID header - // will not be sent - return "" + return true } // Stop the client, enqueueing pending batches and draining the send queue accordingly. Both closing operations are // limited by a deadline, controlled by a configured drain timeout, which is global to the Stop call. 
func (c *walClient) Stop() { - // first close main queue routine - close(c.quit) - c.wg.Wait() - - // fire timeout timer - ctx, cancel := context.WithTimeout(context.Background(), c.drainTimeout) - defer cancel() - - // enqueue batches that might be pending in the batches map - c.enqueuePendingBatches(ctx) - - // drain sendQueue with timeout in context - c.sendQueue.closeAndDrain(ctx) - - // stop request after drain times out or exits - c.cancel() - + // drain shards + c.shards.stop() c.markerHandler.Stop() } - -func (c *walClient) processLabels(lbs model.LabelSet) (model.LabelSet, string) { - tenantID := c.getTenantID(lbs) - return lbs, tenantID -} - -// queuedBatch is a batch specific to a tenant, that is considered ready to be sent. -type queuedBatch struct { - TenantID string - Batch *batch -} - -// queue wraps a buffered channel and a routine that reads from it, sending batches of entries. -type queue struct { - client *walClient - q chan queuedBatch - quit chan struct{} - wg sync.WaitGroup - logger log.Logger -} - -func newQueue(client *walClient, size int, logger log.Logger) *queue { - q := queue{ - client: client, - q: make(chan queuedBatch, size), - quit: make(chan struct{}), - logger: logger, - } - - q.wg.Go(func() { q.run() }) - - return &q -} - -// enqueue adds to the send queue a batch ready to be sent. Note that if the backing queue is has no -// remaining capacity to enqueue the batch, calling enqueue might block. -func (q *queue) enqueue(qb queuedBatch) { - q.q <- qb -} - -// enqueueWithCancel tries to enqueue a batch, giving up if the supplied context times deadlines -// times out. If the batch is successfully enqueued, it returns true. 
-func (q *queue) enqueueWithCancel(ctx context.Context, qb queuedBatch) bool { - select { - case <-ctx.Done(): - return false - case q.q <- qb: - } - return true -} - -func (q *queue) run() { - for { - select { - case <-q.quit: - return - case qb := <-q.q: - // Since inside the actual send operation a context with time out is used, we should exceed that timeout - // instead of cancelling this send operation, since that batch has been taken out of the queue. - q.sendAndReport(context.Background(), qb.TenantID, qb.Batch) - } - } -} - -// closeAndDrain stops gracefully the queue. The process first stops the main routine that reads batches to be sent, -// to instead drain the queue and send those batches from this thread, exiting if the supplied context deadline -// is exceeded. Also, if the underlying buffered channel is fully drain, this will exit promptly. -func (q *queue) closeAndDrain(ctx context.Context) { - // defer main channel closing - defer close(q.q) - - // first stop main routine, and wait for it to signal - close(q.quit) - q.wg.Wait() - - // keep reading messages from sendQueue until all have been consumed, or timeout is exceeded - for { - select { - case qb := <-q.q: - // drain uses the same timeout, so if a timeout was applied to the parent context, it can cancel the underlying - // send operation preemptively. - q.sendAndReport(ctx, qb.TenantID, qb.Batch) - case <-ctx.Done(): - level.Warn(q.logger).Log("msg", "timeout exceeded while draining send queue") - return - default: - level.Debug(q.logger).Log("msg", "drain queue exited because there were no batches left to send") - return - // if default clause is taken, it means there's nothing left in the send queue - } - } -} - -// sendAndReport attempts to send the batch for the given tenant, and either way that operation succeeds or fails, reports -// the data as sent. 
-func (q *queue) sendAndReport(ctx context.Context, tenantId string, b *batch) { - q.client.sendBatch(ctx, tenantId, b) - // mark segment data for that batch as sent, even if the send operation failed - b.reportAsSentData(q.client.markerHandler) -} diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go new file mode 100644 index 00000000000..1733ae80b33 --- /dev/null +++ b/internal/component/common/loki/client/shards.go @@ -0,0 +1,498 @@ +package client + +import ( + "bufio" + "bytes" + "context" + "errors" + "fmt" + "io" + "net/http" + "strconv" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/backoff" + "github.com/prometheus/common/config" + "go.uber.org/atomic" + + "github.com/grafana/alloy/internal/component/common/loki" + "github.com/grafana/alloy/internal/runtime/logging/level" + "github.com/grafana/alloy/internal/useragent" +) + +const ( + // Label reserved to override the tenant ID while processing + // pipeline stages + ReservedLabelTenantID = "__tenant_id__" +) + +// queuedBatch is a batch specific to a tenant, that is considered ready to be sent. +type queuedBatch struct { + TenantID string + Batch *batch +} + +func newQueue(metrics *Metrics, logger log.Logger, cfg Config) *queue { + // Capacity is the worst case size in bytes desired for the send queue. This value is used to calculate the size of + // the buffered channel. The worst case scenario assumed is that every batch buffered in full, hence + // the channel capacity would be calculated as: bufferChannelSize = Capacity / BatchSize. + // For example, assuming BatchSize is the 1 MiB default and Capacity is 100 MiB, + // the underlying buffered channel would buffer up to 100 batches. 
+ capacity := max(cfg.Queue.Capacity/max(cfg.BatchSize, 1), 1) + + return &queue{ + cfg: cfg, + metrics: metrics, + logger: logger, + + batches: make(map[string]*batch), + c: make(chan queuedBatch, capacity), + } +} + +// queue for batching and sending log entries to Loki. +// The queue maintains separate batches per tenant and enqueues batches when they +// reach the configured batch size limit. +type queue struct { + cfg Config + metrics *Metrics + logger log.Logger + c chan queuedBatch + + mu sync.Mutex + // batches maintains one active batch per tenant. When a batch reaches + // the size limit, it's moved to the channel and a new batch is created + // for that tenant. + batches map[string]*batch +} + +// append adds a log entry to the queue for the given tenant. +// It returns true if the entry was successfully queued, false if the queue +// is full and backpressure should be applied. +func (q *queue) append(tenantID string, entry loki.Entry, segmentNum int) bool { + q.mu.Lock() + defer q.mu.Unlock() + + batch, ok := q.batches[tenantID] + if !ok { + // Create a new batch for this tenant. + batch := newBatch(q.cfg.MaxStreams) + _ = batch.add(entry, segmentNum) + q.batches[tenantID] = batch + return true + } + + // If adding this entry would exceed the batch size limit, enqueue the + // current batch and start a new one. + if batch.sizeBytesAfter(entry.Entry) > q.cfg.BatchSize { + select { + case q.c <- queuedBatch{Batch: batch, TenantID: tenantID}: + // Successfully enqueued the batch. + default: + // Channel is full, signal backpressure. + return false + } + + batch := newBatch(q.cfg.MaxStreams) + _ = batch.add(entry, segmentNum) + q.batches[tenantID] = batch + return true + } + + // Add entry to existing batch. If we cannot add entry to batch we will drop it. 
+ if err := batch.add(entry, segmentNum); err != nil { + level.Error(q.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err) + reason := ReasonGeneric + if errors.Is(err, errMaxStreamsLimitExceeded) { + reason = ReasonStreamLimited + } + q.metrics.droppedBytes.WithLabelValues(q.cfg.URL.Host, tenantID, reason).Add(float64(len(entry.Line))) + q.metrics.droppedEntries.WithLabelValues(q.cfg.URL.Host, tenantID, reason).Inc() + } + + return true +} + +// channel returns the channel used to receive batches ready to be sent. +func (q *queue) channel() chan queuedBatch { + return q.c +} + +// drain retrieves all batches that are ready to be sent. +// It returns all batches currently in the channel and all batches +// from the batches map that have exceeded BatchWait. +func (q *queue) drain() []queuedBatch { + q.mu.Lock() + defer q.mu.Unlock() + + var batches []queuedBatch + +loop: + for { + select { + case b := <-q.c: + // Drain all batches from the channel + batches = append(batches, b) + default: + // Check for age-based ready batches + for tenantID, batch := range q.batches { + if batch.age() < q.cfg.BatchWait { + continue + } + + // Batch has exceeded wait time, remove from map and return it + delete(q.batches, tenantID) + batches = append(batches, queuedBatch{ + TenantID: tenantID, + Batch: batch, + }) + } + break loop + } + } + return batches +} + +// flushAndShutdown flushes all remaining batches and closes the channel. +// It will stop early if the done channel is signaled. +func (q *queue) flushAndShutdown(done chan struct{}) { + q.mu.Lock() + defer q.mu.Unlock() + +loop: + for tenantID, batch := range q.batches { + select { + case q.c <- queuedBatch{Batch: batch, TenantID: tenantID}: + // Successfully enqueued batch for sending + case <-done: + // Shutdown timeout reached, stop trying to flush + break loop + } + } + + // It's safe to set batches to nil because a queue is never reused once we have closed it. 
+ q.batches = nil + close(q.c) +} + +// newShards creates a new shards instance for parallel processing of log entries. +// It validates the configuration and creates an HTTP client for sending batches to Loki. +func newShards(metrics *Metrics, logger log.Logger, markerHandler SentDataMarkerHandler, cfg Config) (*shards, error) { + if cfg.URL.URL == nil { + return nil, errors.New("client needs target URL") + } + + err := cfg.Client.Validate() + if err != nil { + return nil, err + } + + client, err := config.NewClientFromConfig(cfg.Client, useragent.ProductName, config.WithHTTP2Disabled()) + if err != nil { + return nil, err + } + + client.Timeout = cfg.Timeout + + return &shards{ + cfg: cfg, + logger: logger, + metrics: metrics, + client: client, + markerHandler: markerHandler, + tenants: make(map[string]struct{}), + }, nil +} + +// shards manages multiple parallel queues for processing and sending log entries to Loki. +// It uses sharding to distribute entries across multiple worker goroutines based on label fingerprints, +// enabling parallel processing and improved throughput. Each shard has its own queue and worker goroutine. +// Entries are routed to shards using a hash of their label fingerprint. +type shards struct { + cfg Config + logger log.Logger + metrics *Metrics + client *http.Client + markerHandler SentDataMarkerHandler + + mut sync.Mutex + tenants map[string]struct{} + queues []*queue + + // running is used to track the number of running shards. + running atomic.Int32 + // done is used to signal that all shards have finished. + done chan struct{} + + // softShutdown is used to signal that no new entries should be accepted. + softShutdown chan struct{} + ctx context.Context + // cancel is used to cancel the context when a hard shutdown is initiated. + cancel context.CancelFunc +} + +// start initializes n shards and starts worker goroutines for each one. 
+// Each shard gets its own queue and a dedicated worker that processes batches +// from that queue. The number of shards determines the parallelism level. +func (s *shards) start(n int) { + n = max(n, 1) + + s.mut.Lock() + defer s.mut.Unlock() + + queues := make([]*queue, n) + + for i := range n { + queues[i] = newQueue(s.metrics, s.logger, s.cfg) + } + + s.queues = queues + s.ctx, s.cancel = context.WithCancel(context.Background()) + s.running.Store(int32(n)) + s.done = make(chan struct{}) + s.softShutdown = make(chan struct{}) + + for i := range n { + go s.runShard(s.queues[i]) + } +} + +// stop tries to perform a graceful shutdown of all shards. +// It first attempts a soft shutdown by signaling that no new entries should be accepted +// and allowing all queues to flush their remaining batches within the drain timeout. +// If the drain timeout is exceeded, it performs a hard shutdown that will drop any remaining batches. +func (s *shards) stop() { + s.mut.Lock() + defer s.mut.Unlock() + + // Attempt a soft shutdown, meaning that all shards try to flush their remaining batches. + close(s.softShutdown) + + for _, q := range s.queues { + go q.flushAndShutdown(s.done) + } + + select { + case <-s.done: + return + case <-time.After(s.cfg.Queue.DrainTimeout): + } + + // Perform hard shutdown + s.cancel() + <-s.done +} + +// runShard is the worker goroutine that processes batches from a single queue. +func (s *shards) runShard(q *queue) { + // Given that a shard handles multiple batches (1 per tenant) and each batch + // can be created at a different point in time, we look for batches whose + // max wait time has been reached 10 times per BatchWait, so that the + // maximum delay we have sending batches is 10% of the max waiting time. + // We apply a lower bound of 10ms to the ticker interval, to avoid too frequent checks in + // case the BatchWait is very low. 
+ const minWaitCheckFrequency = 10 * time.Millisecond + maxWaitCheckFrequency := max(s.cfg.BatchWait/10, minWaitCheckFrequency) + + maxWaitCheck := time.NewTicker(maxWaitCheckFrequency) + defer func() { + maxWaitCheck.Stop() + + if s.running.Dec() == 0 { + close(s.done) + } + }() + + for { + select { + case <-s.ctx.Done(): + // Context is closed when hard shutdown is initiated. + return + case b, ok := <-q.channel(): + if !ok { + // Channel is closed, when a graceful shutdown is successful. + return + } + s.sendBatch(b.TenantID, b.Batch) + case <-maxWaitCheck.C: + // Drain all batches that have exceeded the max wait time. + for _, b := range q.drain() { + s.sendBatch(b.TenantID, b.Batch) + } + } + } +} + +// enqueue routes a log entry to the appropriate shard based on its label fingerprint. +// Returns false if we could not enqueue the entry, either because the shard is shutting down or the queue is full. +// It is up to the caller to retry or drop the entry. +func (s *shards) enqueue(entry loki.Entry, segmentNum int) bool { + s.mut.Lock() + defer s.mut.Unlock() + + entry, tenantID := s.processEntry(entry) + if _, ok := s.tenants[tenantID]; !ok { + s.tenants[tenantID] = struct{}{} + s.initBatchMetrics(tenantID) + } + + fingerprint := entry.Labels.FastFingerprint() + shard := uint64(fingerprint) % uint64(len(s.queues)) + + select { + case <-s.softShutdown: + return false + default: + return s.queues[shard].append(tenantID, entry, segmentNum) + } +} + +func (s *shards) initBatchMetrics(tenantID string) { + // Initialize counters to 0 so the metrics are exported before the first + // occurrence of incrementing to avoid missing metrics. 
+ for _, counter := range s.metrics.countersWithHostTenantReason { + for _, reason := range Reasons { + counter.WithLabelValues(s.cfg.URL.Host, tenantID, reason).Add(0) + } + } + + for _, counter := range s.metrics.countersWithHostTenant { + counter.WithLabelValues(s.cfg.URL.Host, tenantID).Add(0) + } +} + +func (s *shards) processEntry(e loki.Entry) (loki.Entry, string) { + // Check if it has been overridden while processing the pipeline stages + if value, ok := e.Labels[ReservedLabelTenantID]; ok { + return e, string(value) + } + + return e, s.cfg.TenantID +} + +// sendBatch encodes a batch and sends it to Loki with retry logic. +func (s *shards) sendBatch(tenantID string, batch *batch) { + defer batch.reportAsSentData(s.markerHandler) + buf, entriesCount, err := batch.encode() + + if err != nil { + level.Error(s.logger).Log("msg", "error encoding batch", "error", err) + return + } + + bufBytes := float64(len(buf)) + s.metrics.encodedBytes.WithLabelValues(s.cfg.URL.Host, tenantID).Add(bufBytes) + + backoff := backoff.New(s.ctx, s.cfg.BackoffConfig) + var status int + for { + start := time.Now() + // send uses `timeout` internally, so `context.Background` is good enough. 
+ status, err = s.send(context.Background(), tenantID, buf) + + s.metrics.requestDuration.WithLabelValues(strconv.Itoa(status), s.cfg.URL.Host, tenantID).Observe(time.Since(start).Seconds()) + + // Immediately drop rate limited batches to avoid HOL blocking for other tenants not experiencing throttling + if s.cfg.DropRateLimitedBatches && batchIsRateLimited(status) { + level.Warn(s.logger).Log("msg", "dropping batch due to rate limiting applied at ingester") + s.metrics.droppedBytes.WithLabelValues(s.cfg.URL.Host, tenantID, ReasonRateLimited).Add(bufBytes) + s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, ReasonRateLimited).Add(float64(entriesCount)) + return + } + + if err == nil { + s.metrics.sentBytes.WithLabelValues(s.cfg.URL.Host, tenantID).Add(bufBytes) + s.metrics.sentEntries.WithLabelValues(s.cfg.URL.Host, tenantID).Add(float64(entriesCount)) + return + } + + // Only retry 429s, 500s and connection-level errors. + if status > 0 && !batchIsRateLimited(status) && status/100 != 5 { + break + } + + level.Debug(s.logger).Log("msg", "error sending batch, will retry", "status", status, "tenant", tenantID, "error", err) + s.metrics.batchRetries.WithLabelValues(s.cfg.URL.Host, tenantID).Inc() + backoff.Wait() + + // Make sure it sends at least once before checking for retry. 
+ if !backoff.Ongoing() { + break + } + } + + level.Error(s.logger).Log("msg", "final error sending batch, no retries left, dropping data", "status", status, "tenant", tenantID, "error", err) + // If the reason for the last retry error was rate limiting, count the drops as such, even if the previous errors + // were for a different reason + dropReason := ReasonGeneric + if batchIsRateLimited(status) { + dropReason = ReasonRateLimited + } + s.metrics.droppedBytes.WithLabelValues(s.cfg.URL.Host, tenantID, dropReason).Add(bufBytes) + s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, dropReason).Add(float64(entriesCount)) +} + +// send performs the HTTP POST request to send a batch to Loki. +func (s *shards) send(ctx context.Context, tenantID string, buf []byte) (int, error) { + ctx, cancel := context.WithTimeout(ctx, s.cfg.Timeout) + defer cancel() + req, err := http.NewRequestWithContext(ctx, "POST", s.cfg.URL.String(), bytes.NewReader(buf)) + if err != nil { + return -1, err + } + + const contentType = "application/x-protobuf" + req.Header.Set("Content-Type", contentType) + req.Header.Set("User-Agent", userAgent) + + // If the tenant ID is not empty alloy is running in multi-tenant mode, so + // we should send it to Loki + if tenantID != "" { + req.Header.Set("X-Scope-OrgID", tenantID) + } + + // Add custom headers on request + if len(s.cfg.Headers) > 0 { + for k, v := range s.cfg.Headers { + if req.Header.Get(k) == "" { + req.Header.Add(k, v) + } else { + level.Warn(s.logger).Log("msg", "custom header key already exists, skipping", "key", k) + } + } + } + + resp, err := s.client.Do(req) + if err != nil { + return -1, err + } + + // NOTE: it is important in go to fully read the body and + // close it so that the connection can be reused. + // We only partially read the body if we encounter a non 2xx error + // so we should always consume whats left. 
+ // https://github.com/golang/go/blob/32a9804c7ba3f4a0e0bd26cc24b9204860a49ec8/src/net/http/response.go#L59-L64 + // It is unclear that we always need to drain the body but + // https://github.com/golang/go/issues/60240#issuecomment-1551060433 seems to indicate that we should. + defer func() { + _, _ = io.Copy(io.Discard, resp.Body) + _ = resp.Body.Close() + }() + + if resp.StatusCode/100 != 2 { + const maxErrMsgLen = 1024 + scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen)) + line := "" + if scanner.Scan() { + line = scanner.Text() + } + err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, resp.StatusCode, line) + } + return resp.StatusCode, err +} + +func batchIsRateLimited(status int) bool { + return status == 429 +} diff --git a/internal/component/common/loki/client/shards_test.go b/internal/component/common/loki/client/shards_test.go new file mode 100644 index 00000000000..d3368148eee --- /dev/null +++ b/internal/component/common/loki/client/shards_test.go @@ -0,0 +1,176 @@ +package client + +import ( + "sync" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/assert" + + "github.com/grafana/alloy/internal/component/common/loki" + "github.com/grafana/loki/pkg/push" +) + +// each entry counts as 4 bytes. +var entry = loki.Entry{ + Labels: model.LabelSet{"foo": "bar"}, + Entry: push.Entry{Timestamp: time.Now(), Line: "test"}, +} + +func TestQueue_append(t *testing.T) { + // a queue with 8 bytes batches and only one batch can queued. 
+ q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + BatchSize: 8, + Queue: QueueConfig{ + Capacity: 8, + }, + }) + + // add 2 entries to the queue + for range 2 { + queued := q.append("tenant-1", entry, 0) + assert.True(t, queued) + } + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 8) + + // add two more entries, the current batch should be queued and a new batch should be created. + for range 2 { + queued := q.append("tenant-1", entry, 0) + assert.True(t, queued) + } + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 8) + + // adding one more should fail because both queue and batch is full + queued := q.append("tenant-1", entry, 0) + assert.False(t, queued) + + // dequeue current batch. + <-q.channel() + + // add batch again. + queued = q.append("tenant-1", entry, 0) + assert.True(t, queued) + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 4) +} + +func TestQueue_drain(t *testing.T) { + t.Run("should drain queue and current batch", func(t *testing.T) { + // a queue with 8 bytes batches and only one batch can queued at any given time. + q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + BatchSize: 8, + Queue: QueueConfig{ + Capacity: 8, + }, + }) + + // fill up queue and current batch + for range 4 { + queued := q.append("tenant-1", entry, 0) + assert.True(t, queued) + } + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 8) + + batches := q.drain() + // We should drain queued batch and batch stored in memory + assert.Len(t, batches, 2) + }) + + t.Run("should only drain queue", func(t *testing.T) { + // a queue with 8 bytes batches and only one batch can queued at any given time. 
+ q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + BatchSize: 8, + BatchWait: 10 * time.Second, + Queue: QueueConfig{ + Capacity: 8, + }, + }) + + // fill up queue and current batch + for range 4 { + queued := q.append("tenant-1", entry, 0) + assert.True(t, queued) + } + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 8) + + batches := q.drain() + // We should only drain the queued batch; the batch stored in memory has not reached BatchWait yet + assert.Len(t, batches, 1) + }) +} + +func TestQueue_flushAndShutdown(t *testing.T) { + t.Run("should flush all batches to queue", func(t *testing.T) { + // a queue with 8 bytes batches and only one batch can be queued at any given time. + q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + BatchSize: 8, + Queue: QueueConfig{ + Capacity: 8, + }, + }) + + // fill current batch but don't queue it. + for range 2 { + queued := q.append("tenant-1", entry, 0) + assert.True(t, queued) + } + assert.Equal(t, q.batches["tenant-1"].sizeBytes(), 8) + + var wg sync.WaitGroup + + wg.Go(func() { + done := make(chan struct{}) + defer close(done) + q.flushAndShutdown(done) + }) + + wg.Go(func() { + var batches []queuedBatch + for { + b, ok := <-q.channel() + if !ok { + break + } + batches = append(batches, b) + } + assert.Len(t, batches, 1) + }) + wg.Wait() + }) + + t.Run("should stop early if done channel is closed", func(t *testing.T) { + // a queue with 8 bytes batches and only one batch can be queued at any given time. 
+ q.flushAndShutdown(done) + + // Verify batches map is nil. + assert.Nil(t, q.batches) + + // First batch should already be in queue. + _, ok := <-q.channel() + assert.True(t, ok) + + // Second batch should not have been queued + _, ok = <-q.channel() + assert.False(t, ok) + }) +} diff --git a/internal/component/loki/write/types.go b/internal/component/loki/write/types.go index e83b715a560..d355aee04c7 100644 --- a/internal/component/loki/write/types.go +++ b/internal/component/loki/write/types.go @@ -70,10 +70,10 @@ func (r *EndpointOptions) Validate() error { return nil } -// QueueConfig controls how the queue logs remote write client is configured. Note that this client is only used when the -// loki.write component has WAL support enabled. +// QueueConfig controls how shards and queue are configured for client. type QueueConfig struct { Capacity units.Base2Bytes `alloy:"capacity,attr,optional"` + MinShards int `alloy:"min_shards,attr,optional"` DrainTimeout time.Duration `alloy:"drain_timeout,attr,optional"` } @@ -81,6 +81,7 @@ type QueueConfig struct { func (q *QueueConfig) SetToDefault() { *q = QueueConfig{ Capacity: 10 * units.MiB, // considering the default BatchSize of 1MiB, this gives us a default buffered channel of size 10 + MinShards: 1, DrainTimeout: 15 * time.Second, } } @@ -107,6 +108,7 @@ func (args Arguments) convertClientConfigs() []client.Config { DropRateLimitedBatches: !cfg.RetryOnHTTP429, Queue: client.QueueConfig{ Capacity: int(cfg.QueueConfig.Capacity), + MinShards: cfg.QueueConfig.MinShards, DrainTimeout: cfg.QueueConfig.DrainTimeout, }, } From 4181ced2e2709c2a75068955e354d2ab1ecac1e6 Mon Sep 17 00:00:00 2001 From: Karl Persson <23356117+kalleep@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:25:22 +0100 Subject: [PATCH 02/24] Update docs/sources/reference/components/loki/loki.write.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/sources/reference/components/loki/loki.write.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index 19cd96a6412..409782dc280 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -104,7 +104,7 @@ The following arguments are supported: If no `tenant_id` is provided, the component assumes that the Loki instance at `endpoint` is running in single-tenant mode and no X-Scope-OrgID header is sent. When multiple `endpoint` blocks are provided, the `loki.write` component creates a client for each. -Received log entries are fanned-out to these endpoints in succession. That means that if one endpint is bottlenecked, it may impact the rest. +Received log entries are fanned-out to these endpoints in succession. That means that if one endpoint is bottlenecked, it may impact the rest. Each endpoint has a _queue_ of batches to be sent. The `queue_config` block can be used to customize the behavior of this queue. From cf855506e37c9110e7e99a566cd6f35cc00366a4 Mon Sep 17 00:00:00 2001 From: Karl Persson <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:37:52 +0100 Subject: [PATCH 03/24] Update docs/sources/reference/components/loki/loki.write.md Co-authored-by: Piotr <17101802+thampiotr@users.noreply.github.com> --- docs/sources/reference/components/loki/loki.write.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index 409782dc280..ecadbde7243 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -46,7 +46,7 @@ You can use the following blocks with `loki.write`: | `endpoint` > [`basic_auth`][basic_auth] | Configure `basic_auth` for authenticating to the endpoint. 
| no | | `endpoint` > [`oauth2`][oauth2] | Configure OAuth 2.0 for authenticating to the endpoint. | no | | `endpoint` > `oauth2` > [`tls_config`][tls_config] | Configure TLS settings for connecting to the endpoint. | no | -| `endpoint` > [`queue_config`][queue_config] | Configure the queue used for endpoint. | no | +| `endpoint` > [`queue_config`][queue_config] | Configure the queue used for the endpoint. | no | | `endpoint` > [`tls_config`][tls_config] | Configure TLS settings for connecting to the endpoint. | no | | [`wal`][wal] | Write-ahead log configuration. | no | From 307aeb08149f8b7640bbea5029ebe99c205afa83 Mon Sep 17 00:00:00 2001 From: Karl Persson <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:38:06 +0100 Subject: [PATCH 04/24] Update docs/sources/reference/components/loki/loki.write.md Co-authored-by: Piotr <17101802+thampiotr@users.noreply.github.com> --- docs/sources/reference/components/loki/loki.write.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index ecadbde7243..f2bde96c9f4 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -136,7 +136,7 @@ The following arguments are supported: | --------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -------- | | `capacity` | `string` | Controls the size of the underlying send queue buffer. This setting should be considered a worst-case scenario of memory consumption, in which all enqueued batches are full. | `10MiB` | no | | `drain_timeout` | `duration` | Configures the maximum time the client can take to drain the send queue upon shutdown. During that time, it enqueues pending batches and drains the send queue sending each. 
| `"1m"` | no | -| `min_shards` | `number` | Minimum amount of concurrent shards sending samples to the endpoint. | `1` | no | +| `min_shards` | `number` | Minimum number of concurrent shards sending samples to the endpoint. | `1` | no | Each endpoint manages a number of concurrent _shards_ which is responsible for sending a fraction of batches, number of shards are controlled with `min_shards` argument. Each shard has a queue of batches it keeps in memory, controlled with the `capacity` argument. From 601ddec00de4e1a38706544d3600897c277c1580 Mon Sep 17 00:00:00 2001 From: Karl Persson <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:42:27 +0100 Subject: [PATCH 05/24] Update docs/sources/reference/components/loki/loki.write.md Co-authored-by: Piotr <17101802+thampiotr@users.noreply.github.com> --- docs/sources/reference/components/loki/loki.write.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index f2bde96c9f4..f6d6d448e4d 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -138,7 +138,7 @@ The following arguments are supported: | `drain_timeout` | `duration` | Configures the maximum time the client can take to drain the send queue upon shutdown. During that time, it enqueues pending batches and drains the send queue sending each. | `"1m"` | no | | `min_shards` | `number` | Minimum number of concurrent shards sending samples to the endpoint. | `1` | no | -Each endpoint manages a number of concurrent _shards_ which is responsible for sending a fraction of batches, number of shards are controlled with `min_shards` argument. +Each endpoint is divided into a number of concurrent _shards_ which are responsible for sending a fraction of batches. The number of shards is controlled with `min_shards` argument. 
Each shard has a queue of batches it keeps in memory, controlled with the `capacity` argument. ### `tls_config` From 39dcafa25e4e35b89abfb4fa302d0e44ae31d53d Mon Sep 17 00:00:00 2001 From: Karl Persson <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:42:38 +0100 Subject: [PATCH 06/24] Update internal/component/common/loki/client/config.go Co-authored-by: Piotr <17101802+thampiotr@users.noreply.github.com> --- internal/component/common/loki/client/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/component/common/loki/client/config.go b/internal/component/common/loki/client/config.go index f999bba99b5..f88d3715414 100644 --- a/internal/component/common/loki/client/config.go +++ b/internal/component/common/loki/client/config.go @@ -47,7 +47,7 @@ type QueueConfig struct { // is the 1 MiB default, and a capacity of 100 MiB, the underlying buffered channel would buffer up to 100 batches. Capacity int - // MinShards is the minimum amount of concurrent shards sending batches to the endpoint. + // MinShards is the minimum number of concurrent shards sending batches to the endpoint. MinShards int // DrainTimeout controls the maximum time that draining the send queue can take. 
From fce3f82681e3d1b447a73c7a98b5608549ffc915 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 15:07:13 +0100 Subject: [PATCH 07/24] Update "client" to "endpoint" to better match documentation --- .../common/loki/client/consumer_fanout.go | 65 ++++---- .../loki/client/consumer_fanout_test.go | 155 +++++++++--------- .../common/loki/client/consumer_wal.go | 99 ++++++----- .../common/loki/client/consumer_wal_test.go | 53 +++--- .../component/common/loki/client/metrics.go | 10 +- .../component/common/loki/client/shards.go | 2 +- 6 files changed, 187 insertions(+), 197 deletions(-) diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index a8017d62d9c..402f71b2f3d 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -8,43 +8,43 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/dskit/backoff" "github.com/prometheus/client_golang/prometheus" "github.com/grafana/alloy/internal/component/common/loki" "github.com/grafana/alloy/internal/component/common/loki/client/internal" "github.com/grafana/alloy/internal/useragent" - "github.com/grafana/dskit/backoff" ) -func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, clientCfgs ...Config) (*FanoutConsumer, error) { - if len(clientCfgs) == 0 { +func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, cfgs ...Config) (*FanoutConsumer, error) { + if len(cfgs) == 0 { return nil, fmt.Errorf("at least one client config must be provided") } m := &FanoutConsumer{ - clients: make([]*client, 0, len(clientCfgs)), - recv: make(chan loki.Entry), + endpoints: make([]*endpoint, 0, len(cfgs)), + recv: make(chan loki.Entry), } var ( - metrics = NewMetrics(reg) - clientsCheck = make(map[string]struct{}) + metrics = NewMetrics(reg) + endpointsCheck = make(map[string]struct{}) ) - for _, cfg := 
range clientCfgs { - // Don't allow duplicate clients, we have client specific metrics that need at least one unique label value (name). - clientName := getClientName(cfg) - if _, ok := clientsCheck[clientName]; ok { - return nil, fmt.Errorf("duplicate client configs are not allowed, found duplicate for name: %s", cfg.Name) + for _, cfg := range cfgs { + // Don't allow duplicate endpoints, we have endpoint specific metrics that need at least one unique label value (name). + name := getEndpointName(cfg) + if _, ok := endpointsCheck[name]; ok { + return nil, fmt.Errorf("duplicate endpoint configs are not allowed, found duplicate for name: %s", cfg.Name) } - clientsCheck[clientName] = struct{}{} - client, err := newClient(metrics, cfg, logger) + endpointsCheck[name] = struct{}{} + endpoint, err := newEndpoint(metrics, cfg, logger) if err != nil { return nil, fmt.Errorf("error starting client: %w", err) } - m.clients = append(m.clients, client) + m.endpoints = append(m.endpoints, endpoint) } m.wg.Go(m.run) @@ -54,15 +54,15 @@ func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, clientCfgs var _ Consumer = (*FanoutConsumer)(nil) type FanoutConsumer struct { - clients []*client - wg sync.WaitGroup - once sync.Once - recv chan loki.Entry + endpoints []*endpoint + wg sync.WaitGroup + once sync.Once + recv chan loki.Entry } func (c *FanoutConsumer) run() { for e := range c.recv { - for _, c := range c.clients { + for _, c := range c.endpoints { c.Chan() <- e } } @@ -78,20 +78,20 @@ func (c *FanoutConsumer) Stop() { c.wg.Wait() var stopWG sync.WaitGroup - // Stop all clients. - for _, c := range c.clients { + // Stop all endpoints. + for _, c := range c.endpoints { stopWG.Go(func() { c.Stop() }) } - // Wait for all clients to stop. + // Wait for all endpoints to stop. stopWG.Wait() } -// getClientName computes the specific name for each client config. 
The name is either the configured Name setting in Config, +// getEndpointName computes the specific name for each endpoint config. The name is either the configured Name setting in Config, // or a hash of the config as whole, this allows us to detect repeated configs. -func getClientName(cfg Config) string { +func getEndpointName(cfg Config) string { if cfg.Name != "" { return cfg.Name } @@ -108,8 +108,7 @@ func asSha256(o any) string { var userAgent = useragent.Get() -// Client for pushing logs in snappy-compressed protos over HTTP. -type client struct { +type endpoint struct { cfg Config entries chan loki.Entry @@ -121,8 +120,8 @@ type client struct { shards *shards } -func newClient(metrics *Metrics, cfg Config, logger log.Logger) (*client, error) { - logger = log.With(logger, "component", "client", "host", cfg.URL.Host) +func newEndpoint(metrics *Metrics, cfg Config, logger log.Logger) (*endpoint, error) { + logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) shards, err := newShards(metrics, logger, internal.NewNopMarkerHandler(), cfg) if err != nil { @@ -131,7 +130,7 @@ func newClient(metrics *Metrics, cfg Config, logger log.Logger) (*client, error) ctx, cancel := context.WithCancel(context.Background()) - c := &client{ + c := &endpoint{ cfg: cfg, entries: make(chan loki.Entry), shards: shards, @@ -145,7 +144,7 @@ func newClient(metrics *Metrics, cfg Config, logger log.Logger) (*client, error) return c, nil } -func (c *client) run() { +func (c *endpoint) run() { for { select { case <-c.ctx.Done(): @@ -164,11 +163,11 @@ func (c *client) run() { } } -func (c *client) Chan() chan<- loki.Entry { +func (c *endpoint) Chan() chan<- loki.Entry { return c.entries } -func (c *client) Stop() { +func (c *endpoint) Stop() { c.shards.stop() c.cancel() c.wg.Wait() diff --git a/internal/component/common/loki/client/consumer_fanout_test.go b/internal/component/common/loki/client/consumer_fanout_test.go index e3094149a21..6a592532b11 100644 --- 
a/internal/component/common/loki/client/consumer_fanout_test.go +++ b/internal/component/common/loki/client/consumer_fanout_test.go @@ -25,9 +25,9 @@ import ( ) func TestFanoutConsumer(t *testing.T) { - testClientConfig, rwReceivedReqs, closeServer := newServerAndClientConfig(t) + testEndpointConfig, rwReceivedReqs, closeServer := newServerAndEndpointConfig(t) - consumer, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry(), testClientConfig) + consumer, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry(), testEndpointConfig) require.NoError(t, err) receivedRequests := util.NewSyncSlice[util.RemoteWriteRequest]() @@ -61,7 +61,7 @@ func TestFanoutConsumer(t *testing.T) { }, 5*time.Second, time.Second, "timed out waiting for requests to be received") var seenEntries = map[string]struct{}{} - // assert over rw client received entries + // assert over rw received entries defer receivedRequests.DoneIterate() for _, req := range receivedRequests.StartIterate() { require.Len(t, req.Request.Streams, 1, "expected 1 stream requests to be received") @@ -73,12 +73,12 @@ func TestFanoutConsumer(t *testing.T) { } func TestFanoutConsumer_MultipleConfigs(t *testing.T) { - testClientConfig, rwReceivedReqs, closeServer := newServerAndClientConfig(t) - testClientConfig2, rwReceivedReqs2, closeServer2 := newServerAndClientConfig(t) - testClientConfig2.Name = "test-client-2" + testEndpointConfig, rwReceivedReqs, closeServer := newServerAndEndpointConfig(t) + testEndpointConfig2, rwReceivedReqs2, closeServer2 := newServerAndEndpointConfig(t) + testEndpointConfig2.Name = "test-client-2" // start writer and consumer - consumer, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry(), testClientConfig, testClientConfig2) + consumer, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry(), testEndpointConfig, testEndpointConfig2) require.NoError(t, err) receivedRequests := util.NewSyncSlice[util.RemoteWriteRequest]() @@ -117,14 
+117,14 @@ func TestFanoutConsumer_MultipleConfigs(t *testing.T) { } } - // times 2 due to clients being run + // times 2 due to endpoints being run expectedTotalLines := totalLines * 2 require.Eventually(t, func() bool { return receivedRequests.Length() == expectedTotalLines }, 5*time.Second, time.Second, "timed out waiting for requests to be received") var seenEntries int - // assert over rw client received entries + // assert over rw received entries defer receivedRequests.DoneIterate() for _, req := range receivedRequests.StartIterate() { require.Len(t, req.Request.Streams, 1, "expected 1 stream requests to be received") @@ -135,12 +135,12 @@ func TestFanoutConsumer_MultipleConfigs(t *testing.T) { } func TestFanoutConsumer_InvalidConfig(t *testing.T) { - t.Run("no clients", func(t *testing.T) { + t.Run("no endpoints", func(t *testing.T) { _, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry()) require.Error(t, err) }) - t.Run("repeated client", func(t *testing.T) { + t.Run("repeated endpoint", func(t *testing.T) { host, _ := url.Parse("http://localhost:3100") config := Config{URL: flagext.URLValue{URL: host}} _, err := NewFanoutConsumer(log.NewNopLogger(), prometheus.NewRegistry(), config, config) @@ -182,23 +182,20 @@ var logEntries = []loki.Entry{ }, } -func TestClient_Handle(t *testing.T) { +func TestEndpoint(t *testing.T) { tests := map[string]struct { - clientBatchSize int - clientBatchWait time.Duration - clientMaxRetries int - clientTenantID string - clientDropRateLimited bool - serverResponseStatus int - inputEntries []loki.Entry - inputDelay time.Duration - expectedReqs []util.RemoteWriteRequest - expectedMetrics string + endpointConfig Config + serverResponseStatus int + inputEntries []loki.Entry + inputDelay time.Duration + expectedReqs []util.RemoteWriteRequest + expectedMetrics string }{ "batch log entries together until the batch size is reached": { - clientBatchSize: 10, - clientBatchWait: 100 * time.Millisecond, - 
clientMaxRetries: 3, + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 100 * time.Millisecond, + }, serverResponseStatus: 200, inputEntries: []loki.Entry{logEntries[0], logEntries[1], logEntries[2]}, expectedReqs: []util.RemoteWriteRequest{ @@ -236,9 +233,10 @@ func TestClient_Handle(t *testing.T) { `, }, "batch log entries together until the batch wait time is reached": { - clientBatchSize: 10, - clientBatchWait: 100 * time.Millisecond, - clientMaxRetries: 3, + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 100 * time.Millisecond, + }, serverResponseStatus: 200, inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, inputDelay: 110 * time.Millisecond, @@ -277,9 +275,10 @@ func TestClient_Handle(t *testing.T) { `, }, "retry send a batch up to backoff's max retries in case the server responds with a 5xx": { - clientBatchSize: 10, - clientBatchWait: 10 * time.Millisecond, - clientMaxRetries: 3, + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, serverResponseStatus: 500, inputEntries: []loki.Entry{logEntries[0]}, expectedReqs: []util.RemoteWriteRequest{ @@ -321,9 +320,10 @@ func TestClient_Handle(t *testing.T) { `, }, "do not retry send a batch in case the server responds with a 4xx": { - clientBatchSize: 10, - clientBatchWait: 10 * time.Millisecond, - clientMaxRetries: 3, + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, serverResponseStatus: 400, inputEntries: []loki.Entry{logEntries[0]}, expectedReqs: []util.RemoteWriteRequest{ @@ -357,9 +357,10 @@ func TestClient_Handle(t *testing.T) { `, }, "do retry sending a batch in case the server responds with a 429": { - clientBatchSize: 10, - clientBatchWait: 10 * time.Millisecond, - clientMaxRetries: 3, + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, serverResponseStatus: 429, inputEntries: []loki.Entry{logEntries[0]}, expectedReqs: []util.RemoteWriteRequest{ @@ -400,13 +401,14 @@ func 
TestClient_Handle(t *testing.T) { loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 `, }, - "do not retry in case of 429 when client is configured to drop rate limited batches": { - clientBatchSize: 10, - clientBatchWait: 10 * time.Millisecond, - clientMaxRetries: 3, - clientDropRateLimited: true, - serverResponseStatus: 429, - inputEntries: []loki.Entry{logEntries[0]}, + "do not retry in case of 429 when endpoint is configured to drop rate limited batches": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + DropRateLimitedBatches: true, + }, + serverResponseStatus: 429, + inputEntries: []loki.Entry{logEntries[0]}, expectedReqs: []util.RemoteWriteRequest{ { TenantID: "", @@ -437,11 +439,12 @@ func TestClient_Handle(t *testing.T) { loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 `, }, - "batch log entries together honoring the client tenant ID": { - clientBatchSize: 100, - clientBatchWait: 100 * time.Millisecond, - clientMaxRetries: 3, - clientTenantID: "tenant-default", + "batch log entries together honoring the endpoint tenant ID": { + endpointConfig: Config{ + BatchSize: 100, + BatchWait: 100 * time.Millisecond, + TenantID: "tenant-default", + }, serverResponseStatus: 200, inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, expectedReqs: []util.RemoteWriteRequest{ @@ -475,10 +478,11 @@ func TestClient_Handle(t *testing.T) { `, }, "batch log entries together honoring the tenant ID overridden while processing the pipeline stages": { - clientBatchSize: 100, - clientBatchWait: 100 * time.Millisecond, - clientMaxRetries: 3, - clientTenantID: "tenant-default", + endpointConfig: Config{ + BatchSize: 100, + BatchWait: 100 * time.Millisecond, + TenantID: "tenant-default", + }, serverResponseStatus: 200, inputEntries: []loki.Entry{logEntries[0], logEntries[3], logEntries[4], logEntries[5]}, expectedReqs: []util.RemoteWriteRequest{ @@ -547,7 +551,7 @@ func TestClient_Handle(t *testing.T) { }, } - for testName, 
testData := range tests { + for testName, tt := range tests { t.Run(testName, func(t *testing.T) { reg := prometheus.NewRegistry() @@ -555,7 +559,7 @@ func TestClient_Handle(t *testing.T) { receivedReqsChan := make(chan util.RemoteWriteRequest, 10) // Start a local HTTP server - server := util.NewRemoteWriteServer(receivedReqsChan, testData.serverResponseStatus) + server := util.NewRemoteWriteServer(receivedReqsChan, tt.serverResponseStatus) require.NotNil(t, server) defer server.Close() @@ -564,38 +568,31 @@ func TestClient_Handle(t *testing.T) { err := serverURL.Set(server.URL) require.NoError(t, err) - // Instance the client - cfg := Config{ - URL: serverURL, - BatchWait: testData.clientBatchWait, - BatchSize: testData.clientBatchSize, - DropRateLimitedBatches: testData.clientDropRateLimited, - Client: config.DefaultHTTPClientConfig, - BackoffConfig: backoff.Config{MinBackoff: 1 * time.Millisecond, MaxBackoff: 2 * time.Millisecond, MaxRetries: testData.clientMaxRetries}, - Timeout: 1 * time.Second, - TenantID: testData.clientTenantID, - } + tt.endpointConfig.URL = serverURL + tt.endpointConfig.Client = config.DefaultHTTPClientConfig + tt.endpointConfig.BackoffConfig = backoff.Config{MinBackoff: 1 * time.Millisecond, MaxBackoff: 2 * time.Millisecond, MaxRetries: 3} + tt.endpointConfig.Timeout = 1 * time.Second m := NewMetrics(reg) - c, err := newClient(m, cfg, log.NewNopLogger()) + c, err := newEndpoint(m, tt.endpointConfig, log.NewNopLogger()) require.NoError(t, err) // Send all the input log entries - for i, logEntry := range testData.inputEntries { + for i, logEntry := range tt.inputEntries { c.Chan() <- logEntry - if testData.inputDelay > 0 && i < len(testData.inputEntries)-1 { - time.Sleep(testData.inputDelay) + if tt.inputDelay > 0 && i < len(tt.inputEntries)-1 { + time.Sleep(tt.inputDelay) } } // Wait until the expected push requests are received (with a timeout) deadline := time.Now().Add(1 * time.Second) - for len(receivedReqsChan) < 
len(testData.expectedReqs) && time.Now().Before(deadline) { + for len(receivedReqsChan) < len(tt.expectedReqs) && time.Now().Before(deadline) { time.Sleep(5 * time.Millisecond) } - // Stop the client: it waits until the current batch is sent + // Stop the endpoint: it waits until the current batch is sent c.Stop() close(receivedReqsChan) @@ -605,26 +602,26 @@ func TestClient_Handle(t *testing.T) { receivedReqs = append(receivedReqs, req) } - assert.ElementsMatch(t, testData.expectedReqs, receivedReqs) + assert.ElementsMatch(t, tt.expectedReqs, receivedReqs) - expectedMetrics := strings.ReplaceAll(testData.expectedMetrics, "__HOST__", serverURL.Host) + expectedMetrics := strings.ReplaceAll(tt.expectedMetrics, "__HOST__", serverURL.Host) err = testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "loki_write_sent_entries_total", "loki_write_dropped_entries_total", "loki_write_mutated_entries_total", "loki_write_mutated_bytes_total") assert.NoError(t, err) }) } } -func newServerAndClientConfig(t *testing.T) (Config, chan util.RemoteWriteRequest, func()) { +func newServerAndEndpointConfig(t *testing.T) (Config, chan util.RemoteWriteRequest, func()) { receivedReqsChan := make(chan util.RemoteWriteRequest, 10) // Start a local HTTP server server := util.NewRemoteWriteServer(receivedReqsChan, http.StatusOK) require.NotNil(t, server) - testClientURL, _ := url.Parse(server.URL) - testClientConfig := Config{ + url, _ := url.Parse(server.URL) + endpointConfig := Config{ Name: "test-client", - URL: flagext.URLValue{URL: testClientURL}, + URL: flagext.URLValue{URL: url}, Timeout: time.Second * 2, BatchSize: 1, BackoffConfig: backoff.Config{ @@ -635,7 +632,7 @@ func newServerAndClientConfig(t *testing.T) (Config, chan util.RemoteWriteReques DrainTimeout: time.Second * 10, }, } - return testClientConfig, receivedReqsChan, func() { + return endpointConfig, receivedReqsChan, func() { server.Close() close(receivedReqsChan) } diff --git 
a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index 63163a700b9..f1cd1397fc0 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -19,9 +19,9 @@ import ( "github.com/grafana/alloy/internal/component/common/loki/wal" ) -func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Config, clientCfgs ...Config) (*WALConsumer, error) { - if len(clientCfgs) == 0 { - return nil, fmt.Errorf("at least one client config must be provided") +func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Config, cfgs ...Config) (*WALConsumer, error) { + if len(cfgs) == 0 { + return nil, fmt.Errorf("at least one endpoint config must be provided") } writer, err := wal.NewWriter(walCfg, logger, reg) @@ -31,55 +31,52 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con m := &WALConsumer{ writer: writer, - pairs: make([]clientWatcherPair, 0, len(clientCfgs)), + pairs: make([]endpointWatcherPair, 0, len(cfgs)), } var ( - metrics = NewMetrics(reg) - clientsCheck = make(map[string]struct{}) + metrics = NewMetrics(reg) + endpointsCheck = make(map[string]struct{}) - walWatcherMetrics = wal.NewWatcherMetrics(reg) - walMarkerMetrics = internal.NewMarkerMetrics(reg) - walClientMetrics = NewWALClientMetrics(reg) + walWatcherMetrics = wal.NewWatcherMetrics(reg) + walMarkerMetrics = internal.NewMarkerMetrics(reg) + walEndpointMetrics = NewWALEndpointMetrics(reg) ) - for _, cfg := range clientCfgs { - // Don't allow duplicate clients, we have client specific metrics that need at least one unique label value (name). 
- clientName := getClientName(cfg) - if _, ok := clientsCheck[clientName]; ok { - return nil, fmt.Errorf("duplicate client configs are not allowed, found duplicate for name: %s", cfg.Name) + for _, cfg := range cfgs { + // Don't allow duplicate endpoints, we have endpoint specific metrics that need at least one unique label value (name). + name := getEndpointName(cfg) + if _, ok := endpointsCheck[name]; ok { + return nil, fmt.Errorf("duplicate endpoint configs are not allowed, found duplicate for name: %s", cfg.Name) } - clientsCheck[clientName] = struct{}{} - - // add some context information for the logger the watcher uses - wlog := log.With(logger, "client", clientName) + endpointsCheck[name] = struct{}{} markerFileHandler, err := internal.NewMarkerFileHandler(logger, walCfg.Dir) if err != nil { return nil, err } - markerHandler := internal.NewMarkerHandler(markerFileHandler, walCfg.MaxSegmentAge, logger, walMarkerMetrics.WithCurriedId(clientName)) + markerHandler := internal.NewMarkerHandler(markerFileHandler, walCfg.MaxSegmentAge, logger, walMarkerMetrics.WithCurriedId(name)) - client, err := newWalClient(metrics, walClientMetrics.CurryWithId(clientName), cfg, logger, markerHandler) + endpoint, err := newWalEndpoint(metrics, walEndpointMetrics.CurryWithId(name), cfg, logger, markerHandler) if err != nil { - return nil, fmt.Errorf("error starting wal client: %w", err) + return nil, fmt.Errorf("error starting wal endpoint: %w", err) } // subscribe watcher's wal.WriteTo to writer events. This will make the writer trigger the cleanup of the wal.WriteTo // series cache whenever a segment is deleted. 
- writer.SubscribeCleanup(client) + writer.SubscribeCleanup(endpoint) - watcher := wal.NewWatcher(walCfg.Dir, clientName, walWatcherMetrics, client, wlog, walCfg.WatchConfig, markerHandler) + watcher := wal.NewWatcher(walCfg.Dir, name, walWatcherMetrics, endpoint, log.With(logger, "component", name), walCfg.WatchConfig, markerHandler) // subscribe watcher to wal write events writer.SubscribeWrite(watcher) - level.Debug(logger).Log("msg", "starting WAL watcher for client", "client", clientName) + level.Debug(logger).Log("msg", "starting WAL watcher for endpoint", "endpoint", name) watcher.Start() - m.pairs = append(m.pairs, clientWatcherPair{ - watcher: watcher, - client: client, + m.pairs = append(m.pairs, endpointWatcherPair{ + watcher: watcher, + endpoint: endpoint, }) } @@ -88,28 +85,28 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con return m, nil } -type clientWatcherPair struct { - watcher *wal.Watcher - client *walClient +type endpointWatcherPair struct { + watcher *wal.Watcher + endpoint *walEndpoint } -// Stop will proceed to stop, in order, watcher and the client. -func (p clientWatcherPair) Stop(drain bool) { +// Stop will proceed to stop, in order, watcher and the endpoint. +func (p endpointWatcherPair) Stop(drain bool) { // If drain enabled, drain the WAL. if drain { p.watcher.Drain() } p.watcher.Stop() - // Subsequently stop the client. - p.client.Stop() + // Subsequently stop the endpoint. + p.endpoint.Stop() } var _ DrainableConsumer = (*WALConsumer)(nil) type WALConsumer struct { writer *wal.Writer - pairs []clientWatcherPair + pairs []endpointWatcherPair } func (m *WALConsumer) Chan() chan<- loki.Entry { @@ -133,7 +130,7 @@ func (m *WALConsumer) stop(drain bool) { // Depending on whether drain is enabled, the maximum time stopping a watcher and it's queue can take is // the drain time of the watcher + drain time queue. 
To minimize this, and since we keep a separate WAL for each - // client config, each (watcher, queue) pair is stopped concurrently. + // endpoint config, each (watcher, queue) pair is stopped concurrently. for _, pair := range m.pairs { stopWG.Go(func() { pair.Stop(drain) @@ -144,8 +141,8 @@ func (m *WALConsumer) stop(drain bool) { stopWG.Wait() } -func newWalClient(metrics *Metrics, wcMetrics *WALClientMetrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*walClient, error) { - logger = log.With(logger, "component", "client", "host", cfg.URL.Host) +func newWalEndpoint(metrics *Metrics, wcMetrics *WALEndpointMetrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*walEndpoint, error) { + logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) shards, err := newShards(metrics, logger, markerHandler, cfg) if err != nil { @@ -154,10 +151,10 @@ func newWalClient(metrics *Metrics, wcMetrics *WALClientMetrics, cfg Config, log ctx, cancel := context.WithCancel(context.Background()) - c := &walClient{ + c := &walEndpoint{ logger: logger, cfg: cfg, - wcMetrics: wcMetrics, + weMetrics: wcMetrics, shards: shards, ctx: ctx, @@ -174,11 +171,11 @@ func newWalClient(metrics *Metrics, wcMetrics *WALClientMetrics, cfg Config, log return c, nil } -// walClient is a WAL-specific remote write client implementation. This client attests to the wal.WriteTo interface, +// walEndpoint is a WAL-specific remote write implementation. This endpoint attests to the wal.WriteTo interface, // which allows it to be injected in the wal.Watcher as a destination where to write read series and entries. As the watcher // reads from the WAL, batches are created and dispatched onto a send queue when ready to be sent. 
-type walClient struct { - wcMetrics *WALClientMetrics +type walEndpoint struct { + weMetrics *WALEndpointMetrics logger log.Logger cfg Config shards *shards @@ -194,7 +191,7 @@ type walClient struct { markerHandler internal.MarkerHandler } -func (c *walClient) SeriesReset(segmentNum int) { +func (c *walEndpoint) SeriesReset(segmentNum int) { c.seriesLock.Lock() defer c.seriesLock.Unlock() for k, v := range c.seriesSegment { @@ -206,7 +203,7 @@ func (c *walClient) SeriesReset(segmentNum int) { } } -func (c *walClient) StoreSeries(series []record.RefSeries, segment int) { +func (c *walEndpoint) StoreSeries(series []record.RefSeries, segment int) { c.seriesLock.Lock() defer c.seriesLock.Unlock() for _, seriesRec := range series { @@ -215,7 +212,7 @@ func (c *walClient) StoreSeries(series []record.RefSeries, segment int) { } } -func (c *walClient) AppendEntries(entries wal.RefEntries, segment int) error { +func (c *walEndpoint) AppendEntries(entries wal.RefEntries, segment int) error { c.seriesLock.RLock() l, ok := c.series[entries.Ref] c.seriesLock.RUnlock() @@ -240,28 +237,28 @@ func (c *walClient) AppendEntries(entries wal.RefEntries, segment int) error { // It's safe to assume that upon an AppendEntries call, there will always be at least // one entry. - c.wcMetrics.lastReadTimestamp.WithLabelValues().Set(float64(maxSeenTimestamp)) + c.weMetrics.lastReadTimestamp.WithLabelValues().Set(float64(maxSeenTimestamp)) return nil } -func (c *walClient) appendSingleEntry(entry loki.Entry, segmentNum int) bool { +func (c *walEndpoint) appendSingleEntry(entry loki.Entry, segmentNum int) bool { backoff := backoff.New(c.ctx, backoff.Config{ MinBackoff: 5 * time.Millisecond, MaxBackoff: 50 * time.Millisecond, }) for !c.shards.enqueue(entry, segmentNum) { if !backoff.Ongoing() { - // we could not enqueue and client is stopped. + // we could not enqueue and endpoint is stopped. 
return false } } return true } -// Stop the client, enqueueing pending batches and draining the send queue accordingly. Both closing operations are +// Stop the endpoint, enqueueing pending batches and draining the send queue accordingly. Both closing operations are // limited by a deadline, controlled by a configured drain timeout, which is global to the Stop call. -func (c *walClient) Stop() { +func (c *walEndpoint) Stop() { // drain shards c.shards.stop() c.markerHandler.Stop() diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index dd72f308e16..34e53e2ed9f 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -36,9 +36,9 @@ func TestWALConsumer(t *testing.T) { WatchConfig: wal.DefaultWatchConfig, } // start all necessary resources - testClientConfig, rwReceivedReqs, closeServer := newServerAndClientConfig(t) + testEndpointConfig, rwReceivedReqs, closeServer := newServerAndEndpointConfig(t) - consumer, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), walConfig, testClientConfig) + consumer, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), walConfig, testEndpointConfig) require.NoError(t, err) receivedRequests := util.NewSyncSlice[util.RemoteWriteRequest]() @@ -72,7 +72,7 @@ func TestWALConsumer(t *testing.T) { }, 5*time.Second, time.Second, "timed out waiting for requests to be received") var seenEntries = map[string]struct{}{} - // assert over rw client received entries + // assert over rw received entries defer receivedRequests.DoneIterate() for _, req := range receivedRequests.StartIterate() { require.Len(t, req.Request.Streams, 1, "expected 1 stream requests to be received") @@ -84,9 +84,9 @@ func TestWALConsumer(t *testing.T) { } func TestWALConsumer_MultipleConfigs(t *testing.T) { - testClientConfig, rwReceivedReqs, closeServer := 
newServerAndClientConfig(t) - testClientConfig2, rwReceivedReqs2, closeServer2 := newServerAndClientConfig(t) - testClientConfig2.Name = "test-client-2" + testEndpointConfig, rwReceivedReqs, closeServer := newServerAndEndpointConfig(t) + testEndpointConfig2, rwReceivedReqs2, closeServer2 := newServerAndEndpointConfig(t) + testEndpointConfig2.Name = "test-client-2" walConfig := wal.Config{ Dir: t.TempDir(), @@ -95,7 +95,7 @@ func TestWALConsumer_MultipleConfigs(t *testing.T) { MaxSegmentAge: time.Second * 10, } - consumer, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), walConfig, testClientConfig, testClientConfig2) + consumer, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), walConfig, testEndpointConfig, testEndpointConfig2) require.NoError(t, err) receivedRequests := util.NewSyncSlice[util.RemoteWriteRequest]() @@ -134,14 +134,14 @@ func TestWALConsumer_MultipleConfigs(t *testing.T) { } } - // times 2 due to clients being run + // times 2 due to endpoint being run expectedTotalLines := totalLines * 2 require.Eventually(t, func() bool { return receivedRequests.Length() == expectedTotalLines }, 5*time.Second, time.Second, "timed out waiting for requests to be received") var seenEntries int - // assert over rw client received entries + // assert over rw received entries defer receivedRequests.DoneIterate() for _, req := range receivedRequests.StartIterate() { require.Len(t, req.Request.Streams, 1, "expected 1 stream requests to be received") @@ -152,12 +152,12 @@ func TestWALConsumer_MultipleConfigs(t *testing.T) { } func TestWALConsumer_InvalidConfig(t *testing.T) { - t.Run("no clients", func(t *testing.T) { + t.Run("no endpoints", func(t *testing.T) { _, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), wal.Config{}) require.Error(t, err) }) - t.Run("repeated client", func(t *testing.T) { + t.Run("repeated endpoints", func(t *testing.T) { host, _ := url.Parse("http://localhost:3100") config := Config{URL: 
flagext.URLValue{URL: host}} _, err := NewWALConsumer(log.NewNopLogger(), prometheus.NewRegistry(), wal.Config{}, config, config) @@ -166,7 +166,7 @@ func TestWALConsumer_InvalidConfig(t *testing.T) { } type testCase struct { - // numLines is the total number of lines sent through the client in the benchmark. + // numLines is the total number of lines sent through the endpoint in the benchmark. numLines int // numSeries is the different number of series to use in entries. Series are dynamically generated for each entry, but @@ -182,7 +182,7 @@ type testCase struct { expectedRWReqsCount int64 } -func TestWALClient(t *testing.T) { +func TestWALEndpoint(t *testing.T) { for name, tc := range map[string]testCase{ "small test": { numLines: 3, @@ -247,7 +247,6 @@ func TestWALClient(t *testing.T) { err := serverURL.Set(server.URL) require.NoError(t, err) - // Instance the client cfg := Config{ URL: serverURL, BatchWait: tc.batchWait, @@ -261,7 +260,7 @@ func TestWALClient(t *testing.T) { logger := log.NewLogfmtLogger(os.Stdout) - wc, err := newWalClient(NewMetrics(reg), NewWALClientMetrics(reg).CurryWithId("test"), cfg, logger, internal.NewNopMarkerHandler()) + wc, err := newWalEndpoint(NewMetrics(reg), NewWALEndpointMetrics(reg).CurryWithId("test"), cfg, logger, internal.NewNopMarkerHandler()) require.NoError(t, err) //labels := model.LabelSet{"app": "test"} @@ -299,14 +298,14 @@ func TestWALClient(t *testing.T) { require.Equal(t, tc.expectedRWReqsCount, receivedRWsCount.Load(), "number for remote write request not expected") } - // Stop the client: it waits until the current batch is sent + // Stop the endpoint: it waits until the current batch is sent wc.Stop() close(receivedReqsChan) }) } } -func BenchmarkClientImplementations(b *testing.B) { +func BenchmarkEndpointImplementations(b *testing.B) { for name, bc := range map[string]testCase{ "100 entries, single series, no batching": { numLines: 100, @@ -331,13 +330,13 @@ func BenchmarkClientImplementations(b *testing.B) 
{ } { b.Run(name, func(b *testing.B) { b.Run("implementation=wal_nil_marker_handler", func(b *testing.B) { - runWALClientBenchCase(b, bc, func(t *testing.B) internal.MarkerHandler { + runWALEndpointBenchCase(b, bc, func(t *testing.B) internal.MarkerHandler { return internal.NewNopMarkerHandler() }) }) b.Run("implementation=wal_marker_handler", func(b *testing.B) { - runWALClientBenchCase(b, bc, func(t *testing.B) internal.MarkerHandler { + runWALEndpointBenchCase(b, bc, func(t *testing.B) internal.MarkerHandler { dir := b.TempDir() nopLogger := log.NewNopLogger() @@ -351,13 +350,13 @@ func BenchmarkClientImplementations(b *testing.B) { }) b.Run("implementation=regular", func(b *testing.B) { - runRegularClientBenchCase(b, bc) + runEndpointBenchCase(b, bc) }) }) } } -func runWALClientBenchCase(b *testing.B, bc testCase, mhFactory func(t *testing.B) internal.MarkerHandler) { +func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testing.B) internal.MarkerHandler) { reg := prometheus.NewRegistry() // Create a buffer channel where we do enqueue received requests @@ -387,7 +386,6 @@ func runWALClientBenchCase(b *testing.B, bc testCase, mhFactory func(t *testing. err := serverURL.Set(server.URL) require.NoError(b, err) - // Instance the client cfg := Config{ URL: serverURL, BatchWait: time.Millisecond * 50, @@ -404,7 +402,7 @@ func runWALClientBenchCase(b *testing.B, bc testCase, mhFactory func(t *testing. logger := log.NewLogfmtLogger(os.Stdout) - qc, err := newWalClient(NewMetrics(reg), NewWALClientMetrics(reg).CurryWithId("test"), cfg, logger, mhFactory(b)) + qc, err := newWalEndpoint(NewMetrics(reg), NewWALEndpointMetrics(reg).CurryWithId("test"), cfg, logger, mhFactory(b)) require.NoError(b, err) //labels := model.LabelSet{"app": "test"} @@ -444,12 +442,12 @@ func runWALClientBenchCase(b *testing.B, bc testCase, mhFactory func(t *testing. 
reset() } - // Stop the client: it waits until the current batch is sent + // Stop the endpoint: it waits until the current batch is sent qc.Stop() close(receivedReqsChan) } -func runRegularClientBenchCase(b *testing.B, bc testCase) { +func runEndpointBenchCase(b *testing.B, bc testCase) { reg := prometheus.NewRegistry() // Create a buffer channel where we do enqueue received requests @@ -479,7 +477,6 @@ func runRegularClientBenchCase(b *testing.B, bc testCase) { err := serverURL.Set(server.URL) require.NoError(b, err) - // Instance the client cfg := Config{ URL: serverURL, BatchWait: time.Millisecond * 50, @@ -497,7 +494,7 @@ func runRegularClientBenchCase(b *testing.B, bc testCase) { logger := log.NewLogfmtLogger(os.Stdout) m := NewMetrics(reg) - qc, err := newClient(m, cfg, logger) + qc, err := newEndpoint(m, cfg, logger) require.NoError(b, err) //labels := model.LabelSet{"app": "test"} @@ -530,7 +527,7 @@ func runRegularClientBenchCase(b *testing.B, bc testCase) { reset() } - // Stop the client: it waits until the current batch is sent + // Stop the endpoint: it waits until the current batch is sent qc.Stop() close(receivedReqsChan) } diff --git a/internal/component/common/loki/client/metrics.go b/internal/component/common/loki/client/metrics.go index 8129829a03e..046f76dd3f0 100644 --- a/internal/component/common/loki/client/metrics.go +++ b/internal/component/common/loki/client/metrics.go @@ -95,12 +95,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { return &m } -type WALClientMetrics struct { +type WALEndpointMetrics struct { lastReadTimestamp *prometheus.GaugeVec } -func NewWALClientMetrics(reg prometheus.Registerer) *WALClientMetrics { - m := &WALClientMetrics{ +func NewWALEndpointMetrics(reg prometheus.Registerer) *WALEndpointMetrics { + m := &WALEndpointMetrics{ lastReadTimestamp: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "loki_write", @@ -118,8 +118,8 @@ func NewWALClientMetrics(reg prometheus.Registerer) *WALClientMetrics { 
return m } -func (m *WALClientMetrics) CurryWithId(id string) *WALClientMetrics { - return &WALClientMetrics{ +func (m *WALEndpointMetrics) CurryWithId(id string) *WALEndpointMetrics { + return &WALEndpointMetrics{ lastReadTimestamp: m.lastReadTimestamp.MustCurryWith(map[string]string{ "id": id, }), diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index 1733ae80b33..49c27b17865 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -181,7 +181,7 @@ loop: // It validates the configuration and creates an HTTP client for sending batches to Loki. func newShards(metrics *Metrics, logger log.Logger, markerHandler SentDataMarkerHandler, cfg Config) (*shards, error) { if cfg.URL.URL == nil { - return nil, errors.New("client needs target URL") + return nil, errors.New("endpoint needs target URL") } err := cfg.Client.Validate() From ef924897502a0f5930df07823e46ccaaf7c278dc Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 15:18:31 +0100 Subject: [PATCH 08/24] Update naming and comments --- internal/component/common/loki/client/config.go | 8 ++++---- .../component/common/loki/client/consumer_fanout.go | 2 +- .../common/loki/client/consumer_fanout_test.go | 2 +- internal/component/common/loki/client/consumer_wal.go | 2 +- .../component/common/loki/client/consumer_wal_test.go | 6 +++--- internal/component/common/loki/client/shards.go | 4 ++-- internal/component/common/loki/client/shards_test.go | 10 +++++----- internal/component/loki/write/types.go | 6 +++--- internal/component/loki/write/write.go | 2 +- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/internal/component/common/loki/client/config.go b/internal/component/common/loki/client/config.go index f88d3715414..fc1cefefca1 100644 --- a/internal/component/common/loki/client/config.go +++ 
b/internal/component/common/loki/client/config.go @@ -33,11 +33,11 @@ type Config struct { // prevent HOL blocking in multitenant deployments. DropRateLimitedBatches bool - // Queue controls configuration parameters specific to the queue client - Queue QueueConfig + // QueueConfig controls how shards and queues are configured for endpoints. + QueueConfig QueueConfig } -// QueueConfig holds configurations for the queue-based remote-write client. +// QueueConfig controls how shards and queue are configured for client. type QueueConfig struct { // Capacity is the worst case size in bytes desired for the send queue. This value is used to calculate the size of // the buffered channel used underneath. The worst case scenario assumed is that every batch buffered in full, hence @@ -50,6 +50,6 @@ type QueueConfig struct { // MinShards is the minimum number of concurrent shards sending batches to the endpoint. MinShards int - // DrainTimeout controls the maximum time that draining the send queue can take. + // DrainTimeout controls the maximum time that draining the queue can take. 
DrainTimeout time.Duration } diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index 402f71b2f3d..83bdc686399 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -138,7 +138,7 @@ func newEndpoint(metrics *Metrics, cfg Config, logger log.Logger) (*endpoint, er cancel: cancel, } - c.shards.start(cfg.Queue.MinShards) + c.shards.start(cfg.QueueConfig.MinShards) c.wg.Go(func() { c.run() }) return c, nil diff --git a/internal/component/common/loki/client/consumer_fanout_test.go b/internal/component/common/loki/client/consumer_fanout_test.go index 6a592532b11..9f5fe17aab6 100644 --- a/internal/component/common/loki/client/consumer_fanout_test.go +++ b/internal/component/common/loki/client/consumer_fanout_test.go @@ -627,7 +627,7 @@ func newServerAndEndpointConfig(t *testing.T) (Config, chan util.RemoteWriteRequ BackoffConfig: backoff.Config{ MaxRetries: 0, }, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 10, // buffered channel of size 10 DrainTimeout: time.Second * 10, }, diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index f1cd1397fc0..03d1abbf85b 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -166,7 +166,7 @@ func newWalEndpoint(metrics *Metrics, wcMetrics *WALEndpointMetrics, cfg Config, markerHandler: markerHandler, } - c.shards.start(cfg.Queue.MinShards) + c.shards.start(cfg.QueueConfig.MinShards) return c, nil } diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index 34e53e2ed9f..99dfc4449ef 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -255,7 +255,7 @@ func 
TestWALEndpoint(t *testing.T) { BackoffConfig: backoff.Config{MinBackoff: 5 * time.Second, MaxBackoff: 10 * time.Second, MaxRetries: 1}, Timeout: 1 * time.Second, TenantID: "", - Queue: tc.queueConfig, + QueueConfig: tc.queueConfig, } logger := log.NewLogfmtLogger(os.Stdout) @@ -394,7 +394,7 @@ func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin BackoffConfig: backoff.Config{MinBackoff: 5 * time.Second, MaxBackoff: 10 * time.Second, MaxRetries: 1}, Timeout: 1 * time.Second, TenantID: "", - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 1000, // queue size of 100 DrainTimeout: time.Second * 10, }, @@ -485,7 +485,7 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { BackoffConfig: backoff.Config{MinBackoff: 5 * time.Second, MaxBackoff: 10 * time.Second, MaxRetries: 1}, Timeout: 1 * time.Second, TenantID: "", - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 1000, // queue size of 100 DrainTimeout: time.Second * 10, }, diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index 49c27b17865..8522f2812f7 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -40,7 +40,7 @@ func newQueue(metrics *Metrics, logger log.Logger, cfg Config) *queue { // the channel capacity would be calculated as: bufferChannelSize = Capacity / BatchSize. // For example, assuming BatchSize is the 1 MiB default and Capacity is 100 MiB, // the underlying buffered channel would buffer up to 100 batches. 
- capacity := max(cfg.Queue.Capacity/max(cfg.BatchSize, 1), 1) + capacity := max(cfg.QueueConfig.Capacity/max(cfg.BatchSize, 1), 1) return &queue{ cfg: cfg, @@ -277,7 +277,7 @@ func (s *shards) stop() { select { case <-s.done: return - case <-time.After(s.cfg.Queue.DrainTimeout): + case <-time.After(s.cfg.QueueConfig.DrainTimeout): } // Perform hard shutdown diff --git a/internal/component/common/loki/client/shards_test.go b/internal/component/common/loki/client/shards_test.go index d3368148eee..5a0a2dcdf55 100644 --- a/internal/component/common/loki/client/shards_test.go +++ b/internal/component/common/loki/client/shards_test.go @@ -24,7 +24,7 @@ func TestQueue_append(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued. q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 8, }, }) @@ -61,7 +61,7 @@ func TestQueue_drain(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 8, }, }) @@ -83,7 +83,7 @@ func TestQueue_drain(t *testing.T) { q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, BatchWait: 10 * time.Second, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 8, }, }) @@ -106,7 +106,7 @@ func TestQueue_flushAndShutdown(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 8, }, }) @@ -144,7 +144,7 @@ func TestQueue_flushAndShutdown(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. 
q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, - Queue: QueueConfig{ + QueueConfig: QueueConfig{ Capacity: 8, }, }) diff --git a/internal/component/loki/write/types.go b/internal/component/loki/write/types.go index d355aee04c7..8d0970315ae 100644 --- a/internal/component/loki/write/types.go +++ b/internal/component/loki/write/types.go @@ -70,7 +70,7 @@ func (r *EndpointOptions) Validate() error { return nil } -// QueueConfig controls how shards and queue are configured for client. +// QueueConfig controls how shards and queue are configured for endpoint. type QueueConfig struct { Capacity units.Base2Bytes `alloy:"capacity,attr,optional"` MinShards int `alloy:"min_shards,attr,optional"` @@ -86,7 +86,7 @@ func (q *QueueConfig) SetToDefault() { } } -func (args Arguments) convertClientConfigs() []client.Config { +func (args Arguments) convertEndpointConfigs() []client.Config { var res []client.Config for _, cfg := range args.Endpoints { url, _ := url.Parse(cfg.URL) @@ -106,7 +106,7 @@ func (args Arguments) convertClientConfigs() []client.Config { TenantID: cfg.TenantID, MaxStreams: args.MaxStreams, DropRateLimitedBatches: !cfg.RetryOnHTTP429, - Queue: client.QueueConfig{ + QueueConfig: client.QueueConfig{ Capacity: int(cfg.QueueConfig.Capacity), MinShards: cfg.QueueConfig.MinShards, DrainTimeout: cfg.QueueConfig.DrainTimeout, diff --git a/internal/component/loki/write/write.go b/internal/component/loki/write/write.go index 61d568ed4be..287305280f6 100644 --- a/internal/component/loki/write/write.go +++ b/internal/component/loki/write/write.go @@ -164,7 +164,7 @@ func (c *Component) Update(args component.Arguments) error { c.consumer.Stop() } - cfgs := newArgs.convertClientConfigs() + cfgs := newArgs.convertEndpointConfigs() uid := alloyseed.Get().UID for i := range cfgs { From 1b17a30f74f9a367926d0b140ce590d73754f8ac Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 
16:25:50 +0100 Subject: [PATCH 09/24] Refactor so we can reuse endpoint for wal and non wal implementation --- .../common/loki/client/consumer_fanout.go | 97 +--- .../loki/client/consumer_fanout_test.go | 433 ----------------- .../common/loki/client/consumer_wal.go | 119 ++--- .../common/loki/client/consumer_wal_test.go | 28 +- .../component/common/loki/client/endpoint.go | 78 +++ .../common/loki/client/endpoint_test.go | 450 ++++++++++++++++++ .../component/common/loki/client/shards.go | 2 + 7 files changed, 594 insertions(+), 613 deletions(-) create mode 100644 internal/component/common/loki/client/endpoint.go create mode 100644 internal/component/common/loki/client/endpoint_test.go diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index 83bdc686399..0efdc0ffa26 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -1,24 +1,19 @@ package client import ( - "context" - "crypto/sha256" "fmt" "sync" - "time" "github.com/go-kit/log" - "github.com/grafana/dskit/backoff" "github.com/prometheus/client_golang/prometheus" "github.com/grafana/alloy/internal/component/common/loki" "github.com/grafana/alloy/internal/component/common/loki/client/internal" - "github.com/grafana/alloy/internal/useragent" ) func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, cfgs ...Config) (*FanoutConsumer, error) { if len(cfgs) == 0 { - return nil, fmt.Errorf("at least one client config must be provided") + return nil, fmt.Errorf("at least one endpoint config must be provided") } m := &FanoutConsumer{ @@ -39,9 +34,9 @@ func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, cfgs ...Con } endpointsCheck[name] = struct{}{} - endpoint, err := newEndpoint(metrics, cfg, logger) + endpoint, err := newEndpoint(metrics, cfg, logger, internal.NewNopMarkerHandler()) if err != nil { - return nil, fmt.Errorf("error 
starting client: %w", err) + return nil, fmt.Errorf("error starting endpoint: %w", err) } m.endpoints = append(m.endpoints, endpoint) @@ -63,7 +58,7 @@ type FanoutConsumer struct { func (c *FanoutConsumer) run() { for e := range c.recv { for _, c := range c.endpoints { - c.Chan() <- e + c.enqueue(e, 0) } } } @@ -88,87 +83,3 @@ func (c *FanoutConsumer) Stop() { // Wait for all endpoints to stop. stopWG.Wait() } - -// getEndpointName computes the specific name for each endpoint config. The name is either the configured Name setting in Config, -// or a hash of the config as whole, this allows us to detect repeated configs. -func getEndpointName(cfg Config) string { - if cfg.Name != "" { - return cfg.Name - } - return asSha256(cfg) -} - -func asSha256(o any) string { - h := sha256.New() - _, _ = fmt.Fprintf(h, "%v", o) - - temp := fmt.Sprintf("%x", h.Sum(nil)) - return temp[:6] -} - -var userAgent = useragent.Get() - -type endpoint struct { - cfg Config - entries chan loki.Entry - - wg sync.WaitGroup - - ctx context.Context - cancel context.CancelFunc - - shards *shards -} - -func newEndpoint(metrics *Metrics, cfg Config, logger log.Logger) (*endpoint, error) { - logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) - - shards, err := newShards(metrics, logger, internal.NewNopMarkerHandler(), cfg) - if err != nil { - return nil, err - } - - ctx, cancel := context.WithCancel(context.Background()) - - c := &endpoint{ - cfg: cfg, - entries: make(chan loki.Entry), - shards: shards, - ctx: ctx, - cancel: cancel, - } - - c.shards.start(cfg.QueueConfig.MinShards) - - c.wg.Go(func() { c.run() }) - return c, nil -} - -func (c *endpoint) run() { - for { - select { - case <-c.ctx.Done(): - return - case e := <-c.entries: - backoff := backoff.New(c.ctx, backoff.Config{ - MinBackoff: 5 * time.Millisecond, - MaxBackoff: 50 * time.Millisecond, - }) - for !c.shards.enqueue(e, 0) { - if !backoff.Ongoing() { - break - } - } - } - } -} - -func (c *endpoint) Chan() 
chan<- loki.Entry { - return c.entries -} - -func (c *endpoint) Stop() { - c.shards.stop() - c.cancel() - c.wg.Wait() -} diff --git a/internal/component/common/loki/client/consumer_fanout_test.go b/internal/component/common/loki/client/consumer_fanout_test.go index 9f5fe17aab6..db95abff999 100644 --- a/internal/component/common/loki/client/consumer_fanout_test.go +++ b/internal/component/common/loki/client/consumer_fanout_test.go @@ -5,7 +5,6 @@ import ( "fmt" "net/http" "net/url" - "strings" "testing" "time" @@ -14,10 +13,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/loki/pkg/push" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/prometheus/common/config" "github.com/prometheus/common/model" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/grafana/alloy/internal/component/common/loki" @@ -182,435 +178,6 @@ var logEntries = []loki.Entry{ }, } -func TestEndpoint(t *testing.T) { - tests := map[string]struct { - endpointConfig Config - serverResponseStatus int - inputEntries []loki.Entry - inputDelay time.Duration - expectedReqs []util.RemoteWriteRequest - expectedMetrics string - }{ - "batch log entries together until the batch size is reached": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 100 * time.Millisecond, - }, - serverResponseStatus: 200, - inputEntries: []loki.Entry{logEntries[0], logEntries[1], logEntries[2]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry, logEntries[1].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[2].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
- # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 3.0 - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. - # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
- # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - `, - }, - "batch log entries together until the batch wait time is reached": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 100 * time.Millisecond, - }, - serverResponseStatus: 200, - inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, - inputDelay: 110 * time.Millisecond, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[1].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. - # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 2.0 - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
- # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. - # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - `, - }, - "retry send a batch up to backoff's max retries in case the server responds with a 5xx": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 10 * time.Millisecond, - }, - serverResponseStatus: 500, - inputEntries: []loki.Entry{logEntries[0]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
- # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 1 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. - # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. - # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
- # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 - `, - }, - "do not retry send a batch in case the server responds with a 4xx": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 10 * time.Millisecond, - }, - serverResponseStatus: 400, - inputEntries: []loki.Entry{logEntries[0]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 1 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. - # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
- # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. - # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 - `, - }, - "do retry sending a batch in case the server responds with a 429": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 10 * time.Millisecond, - }, - serverResponseStatus: 429, - inputEntries: []loki.Entry{logEntries[0]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 1 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
- # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. - # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. - # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 - `, - }, - "do not retry in case of 429 when endpoint is configured to drop rate limited batches": { - endpointConfig: Config{ - BatchSize: 10, - BatchWait: 10 * time.Millisecond, - DropRateLimitedBatches: true, - }, - serverResponseStatus: 429, - inputEntries: []loki.Entry{logEntries[0]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
- # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 1 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. - # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. - # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
- # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 - `, - }, - "batch log entries together honoring the endpoint tenant ID": { - endpointConfig: Config{ - BatchSize: 100, - BatchWait: 100 * time.Millisecond, - TenantID: "tenant-default", - }, - serverResponseStatus: 200, - inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "tenant-default", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry, logEntries[1].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. - # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant="tenant-default"} 2.0 - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__", reason="ingester_error", tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__", reason="rate_limited", tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
- # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. - # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - `, - }, - "batch log entries together honoring the tenant ID overridden while processing the pipeline stages": { - endpointConfig: Config{ - BatchSize: 100, - BatchWait: 100 * time.Millisecond, - TenantID: "tenant-default", - }, - serverResponseStatus: 200, - inputEntries: []loki.Entry{logEntries[0], logEntries[3], logEntries[4], logEntries[5]}, - expectedReqs: []util.RemoteWriteRequest{ - { - TenantID: "tenant-default", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, - }, - { - TenantID: "tenant-1", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[3].Entry, logEntries[4].Entry}}}}, - }, - { - TenantID: "tenant-2", - Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[5].Entry}}}}, - }, - }, - expectedMetrics: ` - # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
- # TYPE loki_write_sent_entries_total counter - loki_write_sent_entries_total{host="__HOST__",tenant="tenant-1"} 2.0 - loki_write_sent_entries_total{host="__HOST__",tenant="tenant-2"} 1.0 - loki_write_sent_entries_total{host="__HOST__",tenant="tenant-default"} 1.0 - # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. - # TYPE loki_write_dropped_entries_total counter - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 - loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
- # TYPE loki_write_mutated_entries_total counter - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 - loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
- # TYPE loki_write_mutated_bytes_total counter - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 - loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 - `, - }, - } - - for testName, tt := range tests { - t.Run(testName, func(t *testing.T) { - reg := prometheus.NewRegistry() - - // Create a buffer channel where we do enqueue received requests - receivedReqsChan := make(chan util.RemoteWriteRequest, 10) - - // Start a local HTTP server - server := util.NewRemoteWriteServer(receivedReqsChan, tt.serverResponseStatus) - require.NotNil(t, server) - defer server.Close() - - // Get the URL at which the local test server is listening to - serverURL := flagext.URLValue{} - err := serverURL.Set(server.URL) - require.NoError(t, err) - - tt.endpointConfig.URL = serverURL - tt.endpointConfig.Client = config.DefaultHTTPClientConfig - tt.endpointConfig.BackoffConfig = backoff.Config{MinBackoff: 1 * time.Millisecond, MaxBackoff: 2 * time.Millisecond, 
MaxRetries: 3} - tt.endpointConfig.Timeout = 1 * time.Second - - m := NewMetrics(reg) - c, err := newEndpoint(m, tt.endpointConfig, log.NewNopLogger()) - require.NoError(t, err) - - // Send all the input log entries - for i, logEntry := range tt.inputEntries { - c.Chan() <- logEntry - - if tt.inputDelay > 0 && i < len(tt.inputEntries)-1 { - time.Sleep(tt.inputDelay) - } - } - - // Wait until the expected push requests are received (with a timeout) - deadline := time.Now().Add(1 * time.Second) - for len(receivedReqsChan) < len(tt.expectedReqs) && time.Now().Before(deadline) { - time.Sleep(5 * time.Millisecond) - } - - // Stop the endpoint: it waits until the current batch is sent - c.Stop() - close(receivedReqsChan) - - // Get all push requests received on the server side - receivedReqs := make([]util.RemoteWriteRequest, 0) - for req := range receivedReqsChan { - receivedReqs = append(receivedReqs, req) - } - - assert.ElementsMatch(t, tt.expectedReqs, receivedReqs) - - expectedMetrics := strings.ReplaceAll(tt.expectedMetrics, "__HOST__", serverURL.Host) - err = testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "loki_write_sent_entries_total", "loki_write_dropped_entries_total", "loki_write_mutated_entries_total", "loki_write_mutated_bytes_total") - assert.NoError(t, err) - }) - } -} - func newServerAndEndpointConfig(t *testing.T) (Config, chan util.RemoteWriteRequest, func()) { receivedReqsChan := make(chan util.RemoteWriteRequest, 10) diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index 03d1abbf85b..d3377ed1377 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -1,14 +1,11 @@ package client import ( - "context" "fmt" "sync" - "time" "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/backoff" "github.com/prometheus/client_golang/prometheus" 
"github.com/prometheus/common/model" "github.com/prometheus/prometheus/tsdb/chunks" @@ -57,16 +54,18 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con } markerHandler := internal.NewMarkerHandler(markerFileHandler, walCfg.MaxSegmentAge, logger, walMarkerMetrics.WithCurriedId(name)) - endpoint, err := newWalEndpoint(metrics, walEndpointMetrics.CurryWithId(name), cfg, logger, markerHandler) + endpoint, err := newEndpoint(metrics, cfg, logger, markerHandler) if err != nil { - return nil, fmt.Errorf("error starting wal endpoint: %w", err) + return nil, fmt.Errorf("error starting endpoint: %w", err) } + adapter := newWalEndpointAdapter(endpoint, logger, walEndpointMetrics.CurryWithId(name), markerHandler) + // subscribe watcher's wal.WriteTo to writer events. This will make the writer trigger the cleanup of the wal.WriteTo // series cache whenever a segment is deleted. - writer.SubscribeCleanup(endpoint) + writer.SubscribeCleanup(adapter) - watcher := wal.NewWatcher(walCfg.Dir, name, walWatcherMetrics, endpoint, log.With(logger, "component", name), walCfg.WatchConfig, markerHandler) + watcher := wal.NewWatcher(walCfg.Dir, name, walWatcherMetrics, adapter, log.With(logger, "component", name), walCfg.WatchConfig, markerHandler) // subscribe watcher to wal write events writer.SubscribeWrite(watcher) @@ -76,7 +75,7 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con m.pairs = append(m.pairs, endpointWatcherPair{ watcher: watcher, - endpoint: endpoint, + endpoint: adapter, }) } @@ -87,7 +86,7 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con type endpointWatcherPair struct { watcher *wal.Watcher - endpoint *walEndpoint + endpoint *walEndpointAdapter } // Stop will proceed to stop, in order, watcher and the endpoint. @@ -117,8 +116,8 @@ func (m *WALConsumer) Stop() { m.stop(false) } -// StopAndDrain will stop the manager, its WalWriter, Write-Ahead Log watchers, -// and queues accordingly.
It attempt to drain the WAL completely. +// StopAndDrain will stop the consumer, its WalWriter, Write-Ahead Log watchers, +// and endpoints accordingly. It attempts to drain the WAL completely. func (m *WALConsumer) StopAndDrain() { m.stop(true) } @@ -141,24 +140,11 @@ func (m *WALConsumer) stop(drain bool) { stopWG.Wait() } -func newWalEndpoint(metrics *Metrics, wcMetrics *WALEndpointMetrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*walEndpoint, error) { - logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) - - shards, err := newShards(metrics, logger, markerHandler, cfg) - if err != nil { - return nil, err - } - - ctx, cancel := context.WithCancel(context.Background()) - - c := &walEndpoint{ - logger: logger, - cfg: cfg, +func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, wcMetrics *WALEndpointMetrics, markerHandler internal.MarkerHandler) *walEndpointAdapter { + c := &walEndpointAdapter{ + logger: log.With(logger, "component", "waladapter"), weMetrics: wcMetrics, - shards: shards, - - ctx: ctx, - cancel: cancel, + endpoint: endpoint, series: make(map[chunks.HeadSeriesRef]model.LabelSet), seriesSegment: make(map[chunks.HeadSeriesRef]int), @@ -166,22 +152,17 @@ func newWalEndpoint(metrics *Metrics, wcMetrics *WALEndpointMetrics, cfg Config, markerHandler: markerHandler, } - c.shards.start(cfg.QueueConfig.MinShards) - - return c, nil + return c } -// walEndpoint is a WAL-specific remote write implementation. This endpoint attests to the wal.WriteTo interface, -// which allows it to be injected in the wal.Watcher as a destination where to write read series and entries. As the watcher -// reads from the WAL, batches are created and dispatched onto a send queue when ready to be sent. -type walEndpoint struct { - weMetrics *WALEndpointMetrics +// walEndpointAdapter is an adapter between watcher and endpoint.
This component attests to the wal.WriteTo interface, +// which allows it to be injected in the wal.Watcher as a destination where to write series and entries. As the watcher +// reads from the WAL, entries are forwarded here so they can be written to the endpoint. +type walEndpointAdapter struct { logger log.Logger - cfg Config - shards *shards + weMetrics *WALEndpointMetrics - ctx context.Context - cancel context.CancelFunc + endpoint *endpoint // series cache series map[chunks.HeadSeriesRef]model.LabelSet @@ -191,7 +172,7 @@ type walEndpoint struct { markerHandler internal.MarkerHandler } -func (c *walEndpoint) SeriesReset(segmentNum int) { +func (c *walEndpointAdapter) SeriesReset(segmentNum int) { c.seriesLock.Lock() defer c.seriesLock.Unlock() for k, v := range c.seriesSegment { @@ -203,7 +184,7 @@ func (c *walEndpoint) SeriesReset(segmentNum int) { } } -func (c *walEndpoint) StoreSeries(series []record.RefSeries, segment int) { +func (c *walEndpointAdapter) StoreSeries(series []record.RefSeries, segment int) { c.seriesLock.Lock() defer c.seriesLock.Unlock() for _, seriesRec := range series { @@ -212,29 +193,32 @@ func (c *walEndpoint) StoreSeries(series []record.RefSeries, segment int) { } } -func (c *walEndpoint) AppendEntries(entries wal.RefEntries, segment int) error { +func (c *walEndpointAdapter) AppendEntries(entries wal.RefEntries, segment int) error { c.seriesLock.RLock() l, ok := c.series[entries.Ref] c.seriesLock.RUnlock() - var maxSeenTimestamp int64 = -1 - if ok { - for _, e := range entries.Entries { - ok := c.appendSingleEntry(loki.Entry{Labels: l, Entry: e}, segment) - if !ok { - return nil - } - - if e.Timestamp.Unix() > maxSeenTimestamp { - maxSeenTimestamp = e.Timestamp.Unix() - } - } - // count all enqueued appended entries as received from WAL - c.markerHandler.UpdateReceivedData(segment, len(entries.Entries)) - } else { + + if !ok { // TODO(thepalbi): Add metric here level.Debug(c.logger).Log("msg", "series for entry not found") + return nil + 
} + + var maxSeenTimestamp int64 = -1 + for _, e := range entries.Entries { + ok := c.endpoint.enqueue(loki.Entry{Labels: l, Entry: e}, segment) + if !ok { + return nil + } + + if e.Timestamp.Unix() > maxSeenTimestamp { + maxSeenTimestamp = e.Timestamp.Unix() + } } + // count all enqueued appended entries as received from WAL + c.markerHandler.UpdateReceivedData(segment, len(entries.Entries)) + // It's safe to assume that upon an AppendEntries call, there will always be at least // one entry. c.weMetrics.lastReadTimestamp.WithLabelValues().Set(float64(maxSeenTimestamp)) @@ -242,24 +226,9 @@ func (c *walEndpoint) AppendEntries(entries wal.RefEntries, segment int) error { return nil } -func (c *walEndpoint) appendSingleEntry(entry loki.Entry, segmentNum int) bool { - backoff := backoff.New(c.ctx, backoff.Config{ - MinBackoff: 5 * time.Millisecond, - MaxBackoff: 50 * time.Millisecond, - }) - for !c.shards.enqueue(entry, segmentNum) { - if !backoff.Ongoing() { - // we could not enqueue and endpoint is stopped. - return false - } - } - return true -} - // Stop the endpoint, enqueueing pending batches and draining the send queue accordingly. Both closing operations are // limited by a deadline, controlled by a configured drain timeout, which is global to the Stop call. 
-func (c *walEndpoint) Stop() { - // drain shards - c.shards.stop() +func (c *walEndpointAdapter) Stop() { + c.endpoint.Stop() c.markerHandler.Stop() } diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index 99dfc4449ef..e0d4d1db31b 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -259,9 +259,11 @@ func TestWALEndpoint(t *testing.T) { } logger := log.NewLogfmtLogger(os.Stdout) + marker := internal.NewNopMarkerHandler() - wc, err := newWalEndpoint(NewMetrics(reg), NewWALEndpointMetrics(reg).CurryWithId("test"), cfg, logger, internal.NewNopMarkerHandler()) + endpoint, err := newEndpoint(NewMetrics(reg), cfg, logger, marker) require.NoError(t, err) + adapter := newWalEndpointAdapter(endpoint, logger, NewWALEndpointMetrics(reg).CurryWithId("test"), marker) //labels := model.LabelSet{"app": "test"} lines := make([]string, 0, tc.numLines) @@ -272,7 +274,7 @@ func TestWALEndpoint(t *testing.T) { // Send all the input log entries for i, l := range lines { mod := i % tc.numSeries - wc.StoreSeries([]record.RefSeries{ + adapter.StoreSeries([]record.RefSeries{ { Labels: labels.New( labels.Label{Name: "app", Value: fmt.Sprintf("test-%d", mod)}, @@ -281,7 +283,7 @@ func TestWALEndpoint(t *testing.T) { }, }, 0) - _ = wc.AppendEntries(wal.RefEntries{ + _ = adapter.AppendEntries(wal.RefEntries{ Ref: chunks.HeadSeriesRef(mod), Entries: []push.Entry{{ Timestamp: time.Now(), @@ -299,7 +301,7 @@ func TestWALEndpoint(t *testing.T) { } // Stop the endpoint: it waits until the current batch is sent - wc.Stop() + adapter.Stop() close(receivedReqsChan) }) } @@ -401,9 +403,11 @@ func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin } logger := log.NewLogfmtLogger(os.Stdout) + marker := mhFactory(b) - qc, err := newWalEndpoint(NewMetrics(reg), NewWALEndpointMetrics(reg).CurryWithId("test"), cfg, 
logger, mhFactory(b)) + endpoint, err := newEndpoint(NewMetrics(reg), cfg, logger, marker) require.NoError(b, err) + adapter := newWalEndpointAdapter(endpoint, logger, NewWALEndpointMetrics(reg).CurryWithId("test"), marker) //labels := model.LabelSet{"app": "test"} var lines []string @@ -415,7 +419,7 @@ func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin // Send all the input log entries for j, l := range lines { seriesId := j % bc.numSeries - qc.StoreSeries([]record.RefSeries{ + adapter.StoreSeries([]record.RefSeries{ { Labels: labels.New( // take j module bc.numSeries to evenly distribute those numSeries across all sent entries @@ -425,7 +429,7 @@ func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin }, }, 0) - _ = qc.AppendEntries(wal.RefEntries{ + _ = adapter.AppendEntries(wal.RefEntries{ Ref: chunks.HeadSeriesRef(seriesId), Entries: []push.Entry{{ Timestamp: time.Now(), @@ -443,7 +447,7 @@ func runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin } // Stop the endpoint: it waits until the current batch is sent - qc.Stop() + adapter.Stop() close(receivedReqsChan) } @@ -494,7 +498,7 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { logger := log.NewLogfmtLogger(os.Stdout) m := NewMetrics(reg) - qc, err := newEndpoint(m, cfg, logger) + endpoint, err := newEndpoint(m, cfg, logger, internal.NewNopMarkerHandler()) require.NoError(b, err) //labels := model.LabelSet{"app": "test"} @@ -507,7 +511,7 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { // Send all the input log entries for j, l := range lines { seriesId := j % bc.numSeries - qc.Chan() <- loki.Entry{ + endpoint.enqueue(loki.Entry{ Labels: model.LabelSet{ // take j module bc.numSeries to evenly distribute those numSeries across all sent entries "app": model.LabelValue(fmt.Sprintf("series-%d", seriesId)), @@ -516,7 +520,7 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { Timestamp: time.Now(), Line: l, }, - } 
+ }, 0) } require.Eventually(b, func() bool { @@ -528,6 +532,6 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { } // Stop the endpoint: it waits until the current batch is sent - qc.Stop() + endpoint.Stop() close(receivedReqsChan) } diff --git a/internal/component/common/loki/client/endpoint.go b/internal/component/common/loki/client/endpoint.go new file mode 100644 index 00000000000..41000559f85 --- /dev/null +++ b/internal/component/common/loki/client/endpoint.go @@ -0,0 +1,78 @@ +package client + +import ( + "context" + "crypto/sha256" + "fmt" + "time" + + "github.com/go-kit/log" + "github.com/grafana/alloy/internal/component/common/loki" + "github.com/grafana/alloy/internal/component/common/loki/client/internal" + "github.com/grafana/dskit/backoff" +) + +type endpoint struct { + cfg Config + entries chan loki.Entry + + ctx context.Context + cancel context.CancelFunc + + shards *shards +} + +func newEndpoint(metrics *Metrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*endpoint, error) { + logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) + + shards, err := newShards(metrics, logger, markerHandler, cfg) + if err != nil { + return nil, err + } + + ctx, cancel := context.WithCancel(context.Background()) + + c := &endpoint{ + cfg: cfg, + entries: make(chan loki.Entry), + shards: shards, + ctx: ctx, + cancel: cancel, + } + + c.shards.start(cfg.QueueConfig.MinShards) + return c, nil +} + +// enqueue will try to enqueue entry. If the endpoint is stopped, any active attempts will +// be stopped and false will be returned.
+func (c *endpoint) enqueue(entry loki.Entry, segmentNum int) bool { + backoff := backoff.New(c.ctx, backoff.Config{ + MinBackoff: 5 * time.Millisecond, + MaxBackoff: 50 * time.Millisecond, + }) + for !c.shards.enqueue(entry, segmentNum) { + backoff.Wait() + if !backoff.Ongoing() { + return false + } + } + return true +} + +func (c *endpoint) Stop() { + c.shards.stop() + c.cancel() +} + +// getEndpointName computes the specific name for each endpoint config. The name is either the configured Name setting in Config, +// or a hash of the config as a whole; this allows us to detect repeated configs. +func getEndpointName(cfg Config) string { + if cfg.Name != "" { + return cfg.Name + } + + h := sha256.New() + _, _ = fmt.Fprintf(h, "%v", cfg) + return fmt.Sprintf("%x", h.Sum(nil))[:6] +} diff --git a/internal/component/common/loki/client/endpoint_test.go b/internal/component/common/loki/client/endpoint_test.go new file mode 100644 index 00000000000..a5987026ed3 --- /dev/null +++ b/internal/component/common/loki/client/endpoint_test.go @@ -0,0 +1,450 @@ +package client + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/backoff" + "github.com/grafana/dskit/flagext" + "github.com/grafana/loki/pkg/push" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/common/config" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/grafana/alloy/internal/component/common/loki" + "github.com/grafana/alloy/internal/component/common/loki/client/internal" + "github.com/grafana/alloy/internal/loki/util" +) + +func TestEndpoint(t *testing.T) { + tests := map[string]struct { + endpointConfig Config + serverResponseStatus int + inputEntries []loki.Entry + inputDelay time.Duration + expectedReqs []util.RemoteWriteRequest + expectedMetrics string + }{ + "batch log entries together until the batch size is reached": { + 
endpointConfig: Config{ + BatchSize: 10, + BatchWait: 100 * time.Millisecond, + }, + serverResponseStatus: 200, + inputEntries: []loki.Entry{logEntries[0], logEntries[1], logEntries[2]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry, logEntries[1].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[2].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. + # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 3.0 + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. + # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
+ # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + `, + }, + "batch log entries together until the batch wait time is reached": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 100 * time.Millisecond, + }, + serverResponseStatus: 200, + inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, + inputDelay: 110 * time.Millisecond, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[1].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. + # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 2.0 + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
+ # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. + # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + `, + }, + "retry send a batch up to backoff's max retries in case the server responds with a 5xx": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, + serverResponseStatus: 500, + inputEntries: []loki.Entry{logEntries[0]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
+ # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 1 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. + # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. + # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
+ # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 + `, + }, + "do not retry send a batch in case the server responds with a 4xx": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, + serverResponseStatus: 400, + inputEntries: []loki.Entry{logEntries[0]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 1 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. + # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
+ # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. + # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 + `, + }, + "do retry sending a batch in case the server responds with a 429": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + }, + serverResponseStatus: 429, + inputEntries: []loki.Entry{logEntries[0]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 1 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
+ # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. + # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. + # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 + `, + }, + "do not retry in case of 429 when endpoint is configured to drop rate limited batches": { + endpointConfig: Config{ + BatchSize: 10, + BatchWait: 10 * time.Millisecond, + DropRateLimitedBatches: true, + }, + serverResponseStatus: 429, + inputEntries: []loki.Entry{logEntries[0]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
+ # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 1 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. + # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. + # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant=""} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant=""} 0 + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
+ # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant=""} 0 + `, + }, + "batch log entries together honoring the endpoint tenant ID": { + endpointConfig: Config{ + BatchSize: 100, + BatchWait: 100 * time.Millisecond, + TenantID: "tenant-default", + }, + serverResponseStatus: 200, + inputEntries: []loki.Entry{logEntries[0], logEntries[1]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "tenant-default", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry, logEntries[1].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. + # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant="tenant-default"} 2.0 + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__", reason="ingester_error", tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__", reason="rate_limited", tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
+ # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. + # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + `, + }, + "batch log entries together honoring the tenant ID overridden while processing the pipeline stages": { + endpointConfig: Config{ + BatchSize: 100, + BatchWait: 100 * time.Millisecond, + TenantID: "tenant-default", + }, + serverResponseStatus: 200, + inputEntries: []loki.Entry{logEntries[0], logEntries[3], logEntries[4], logEntries[5]}, + expectedReqs: []util.RemoteWriteRequest{ + { + TenantID: "tenant-default", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[0].Entry}}}}, + }, + { + TenantID: "tenant-1", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[3].Entry, logEntries[4].Entry}}}}, + }, + { + TenantID: "tenant-2", + Request: push.PushRequest{Streams: []push.Stream{{Labels: "{}", Entries: []push.Entry{logEntries[5].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP loki_write_sent_entries_total Number of log entries sent to the ingester. 
+ # TYPE loki_write_sent_entries_total counter + loki_write_sent_entries_total{host="__HOST__",tenant="tenant-1"} 2.0 + loki_write_sent_entries_total{host="__HOST__",tenant="tenant-2"} 1.0 + loki_write_sent_entries_total{host="__HOST__",tenant="tenant-default"} 1.0 + # HELP loki_write_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE loki_write_dropped_entries_total counter + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 + loki_write_dropped_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + # HELP loki_write_mutated_entries_total The total number of log entries that have been mutated. 
+ # TYPE loki_write_mutated_entries_total counter + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 + loki_write_mutated_entries_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + # HELP loki_write_mutated_bytes_total The total number of bytes that have been mutated. 
+ # TYPE loki_write_mutated_bytes_total counter + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-1"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-2"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="ingester_error",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-1"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-2"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="line_too_long",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-1"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-2"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="rate_limited",tenant="tenant-default"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-1"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-2"} 0 + loki_write_mutated_bytes_total{host="__HOST__",reason="stream_limited",tenant="tenant-default"} 0 + `, + }, + } + + for testName, tt := range tests { + t.Run(testName, func(t *testing.T) { + reg := prometheus.NewRegistry() + + // Create a buffer channel where we do enqueue received requests + receivedReqsChan := make(chan util.RemoteWriteRequest, 10) + + // Start a local HTTP server + server := util.NewRemoteWriteServer(receivedReqsChan, tt.serverResponseStatus) + require.NotNil(t, server) + defer server.Close() + + // Get the URL at which the local test server is listening to + serverURL := flagext.URLValue{} + err := serverURL.Set(server.URL) + require.NoError(t, err) + + tt.endpointConfig.URL = serverURL + tt.endpointConfig.Client = config.DefaultHTTPClientConfig + tt.endpointConfig.BackoffConfig = backoff.Config{MinBackoff: 1 * time.Millisecond, MaxBackoff: 2 * time.Millisecond, 
MaxRetries: 3} + tt.endpointConfig.Timeout = 1 * time.Second + + m := NewMetrics(reg) + c, err := newEndpoint(m, tt.endpointConfig, log.NewNopLogger(), internal.NewNopMarkerHandler()) + require.NoError(t, err) + + // Send all the input log entries + for i, logEntry := range tt.inputEntries { + c.enqueue(logEntry, 0) + + if tt.inputDelay > 0 && i < len(tt.inputEntries)-1 { + time.Sleep(tt.inputDelay) + } + } + + // Wait until the expected push requests are received (with a timeout) + deadline := time.Now().Add(1 * time.Second) + for len(receivedReqsChan) < len(tt.expectedReqs) && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + + // Stop the endpoint: it waits until the current batch is sent + c.Stop() + close(receivedReqsChan) + + // Get all push requests received on the server side + receivedReqs := make([]util.RemoteWriteRequest, 0) + for req := range receivedReqsChan { + receivedReqs = append(receivedReqs, req) + } + + assert.ElementsMatch(t, tt.expectedReqs, receivedReqs) + + expectedMetrics := strings.ReplaceAll(tt.expectedMetrics, "__HOST__", serverURL.Host) + err = testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "loki_write_sent_entries_total", "loki_write_dropped_entries_total", "loki_write_mutated_entries_total", "loki_write_mutated_bytes_total") + assert.NoError(t, err) + }) + } +} diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index 8522f2812f7..5a909d758f2 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -434,6 +434,8 @@ func (s *shards) sendBatch(tenantID string, batch *batch) { s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, dropReason).Add(float64(entriesCount)) } +var userAgent = useragent.Get() + // send performs the HTTP POST request to send a batch to Loki. 
func (s *shards) send(ctx context.Context, tenantID string, buf []byte) (int, error) { ctx, cancel := context.WithTimeout(ctx, s.cfg.Timeout) From 8c47db9b4aa98a9634ea03945803ed9295650634 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:33:27 +0100 Subject: [PATCH 10/24] wrapp close done in once --- internal/component/common/loki/client/shards.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index 5a909d758f2..be758794933 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -222,7 +222,8 @@ type shards struct { queues []*queue // running is used to track the number of running shards. - running atomic.Int32 + running atomic.Int32 + onceDone sync.Once // done is used to signal that all shards have finished. done chan struct{} @@ -251,6 +252,7 @@ func (s *shards) start(n int) { s.queues = queues s.ctx, s.cancel = context.WithCancel(context.Background()) s.running.Store(int32(n)) + s.onceDone = sync.Once{} s.done = make(chan struct{}) s.softShutdown = make(chan struct{}) @@ -301,7 +303,7 @@ func (s *shards) runShard(q *queue) { maxWaitCheck.Stop() if s.running.Dec() == 0 { - close(s.done) + s.onceDone.Do(func() { close(s.done) }) } }() From 22483b91c4af06029edff43f7de8ba6a3ff69841 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:35:50 +0100 Subject: [PATCH 11/24] update comment --- internal/component/common/loki/client/endpoint.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/component/common/loki/client/endpoint.go b/internal/component/common/loki/client/endpoint.go index 41000559f85..d879770989b 100644 --- a/internal/component/common/loki/client/endpoint.go +++ b/internal/component/common/loki/client/endpoint.go @@ -44,7 +44,7 @@ func 
newEndpoint(metrics *Metrics, cfg Config, logger log.Logger, markerHandler return c, nil } -// enqueue will try to enqueue entry. If endpoint it stopped any active attempts will +// enqueue will try to enqueue entry. If endpoint is stopped any active attempts will // be stopped and false will be returned. func (c *endpoint) enqueue(entry loki.Entry, segmentNum int) bool { backoff := backoff.New(c.ctx, backoff.Config{ From b561650f7aa17279318bc34a2f331a86da9b99c0 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:52:09 +0100 Subject: [PATCH 12/24] fix --- .../common/loki/client/consumer_fanout.go | 2 +- .../common/loki/client/consumer_wal.go | 49 +++++++++---------- .../common/loki/client/consumer_wal_test.go | 2 +- .../component/common/loki/client/endpoint.go | 3 +- .../common/loki/client/endpoint_test.go | 2 +- 5 files changed, 27 insertions(+), 31 deletions(-) diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index 0efdc0ffa26..bea9ac74015 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -76,7 +76,7 @@ func (c *FanoutConsumer) Stop() { // Stop all endpoints. 
for _, c := range c.endpoints { stopWG.Go(func() { - c.Stop() + c.stop() }) } diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index d3377ed1377..c9be85fd856 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -140,11 +140,11 @@ func (m *WALConsumer) stop(drain bool) { stopWG.Wait() } -func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, wcMetrics *WALEndpointMetrics, markerHandler internal.MarkerHandler) *walEndpointAdapter { +func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, metrics *WALEndpointMetrics, markerHandler internal.MarkerHandler) *walEndpointAdapter { c := &walEndpointAdapter{ - logger: log.With(logger, "component", "waladapter"), - weMetrics: wcMetrics, - endpoint: endpoint, + logger: log.With(logger, "component", "waladapter"), + metrics: metrics, + endpoint: endpoint, series: make(map[chunks.HeadSeriesRef]model.LabelSet), seriesSegment: make(map[chunks.HeadSeriesRef]int), @@ -159,8 +159,8 @@ func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, wcMetrics *WAL // which allows it to be injected in the wal.Watcher as a destination where to write series and entries. As the watcher // reads from the WAL, entires are forwarded here so it can be written to endpoint. 
type walEndpointAdapter struct { - logger log.Logger - weMetrics *WALEndpointMetrics + logger log.Logger + metrics *WALEndpointMetrics endpoint *endpoint @@ -197,38 +197,33 @@ func (c *walEndpointAdapter) AppendEntries(entries wal.RefEntries, segment int) c.seriesLock.RLock() l, ok := c.series[entries.Ref] c.seriesLock.RUnlock() - - if !ok { - // TODO(thepalbi): Add metric here - level.Debug(c.logger).Log("msg", "series for entry not found") - return nil - } - var maxSeenTimestamp int64 = -1 - for _, e := range entries.Entries { - ok := c.endpoint.enqueue(loki.Entry{Labels: l, Entry: e}, segment) - if !ok { - return nil - } - - if e.Timestamp.Unix() > maxSeenTimestamp { - maxSeenTimestamp = e.Timestamp.Unix() + if ok { + for _, e := range entries.Entries { + ok := c.endpoint.enqueue(loki.Entry{Labels: l, Entry: e}, segment) + if !ok { + return nil + } + if e.Timestamp.Unix() > maxSeenTimestamp { + maxSeenTimestamp = e.Timestamp.Unix() + } } + // count all enqueued appended entries as received from WAL + c.markerHandler.UpdateReceivedData(segment, len(entries.Entries)) + } else { + // TODO(thepalbi): Add metric here + level.Debug(c.logger).Log("msg", "series for entry not found") } - // count all enqueued appended entries as received from WAL - c.markerHandler.UpdateReceivedData(segment, len(entries.Entries)) - // It's safe to assume that upon an AppendEntries call, there will always be at least // one entry. - c.weMetrics.lastReadTimestamp.WithLabelValues().Set(float64(maxSeenTimestamp)) - + c.metrics.lastReadTimestamp.WithLabelValues().Set(float64(maxSeenTimestamp)) return nil } // Stop the endpoint, enqueueing pending batches and draining the send queue accordingly. Both closing operations are // limited by a deadline, controlled by a configured drain timeout, which is global to the Stop call. 
func (c *walEndpointAdapter) Stop() { - c.endpoint.Stop() + c.endpoint.stop() c.markerHandler.Stop() } diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index e0d4d1db31b..502de224401 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -532,6 +532,6 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { } // Stop the endpoint: it waits until the current batch is sent - endpoint.Stop() + endpoint.stop() close(receivedReqsChan) } diff --git a/internal/component/common/loki/client/endpoint.go b/internal/component/common/loki/client/endpoint.go index d879770989b..971ac41f76e 100644 --- a/internal/component/common/loki/client/endpoint.go +++ b/internal/component/common/loki/client/endpoint.go @@ -51,6 +51,7 @@ func (c *endpoint) enqueue(entry loki.Entry, segmentNum int) bool { MinBackoff: 5 * time.Millisecond, MaxBackoff: 50 * time.Millisecond, }) + for !c.shards.enqueue(entry, segmentNum) { backoff.Wait() if !backoff.Ongoing() { @@ -60,7 +61,7 @@ func (c *endpoint) enqueue(entry loki.Entry, segmentNum int) bool { return true } -func (c *endpoint) Stop() { +func (c *endpoint) stop() { c.shards.stop() c.cancel() } diff --git a/internal/component/common/loki/client/endpoint_test.go b/internal/component/common/loki/client/endpoint_test.go index a5987026ed3..3e1cc451eb7 100644 --- a/internal/component/common/loki/client/endpoint_test.go +++ b/internal/component/common/loki/client/endpoint_test.go @@ -431,7 +431,7 @@ func TestEndpoint(t *testing.T) { } // Stop the endpoint: it waits until the current batch is sent - c.Stop() + c.stop() close(receivedReqsChan) // Get all push requests received on the server side From aaebdd565dac9c38bb6bb0bbc154a94e02c4f6aa Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 09:02:10 +0100 Subject: [PATCH 13/24] Fix 
metric --- internal/component/common/loki/client/consumer_wal.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index c9be85fd856..b12fa6dc289 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -59,7 +59,7 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con return nil, fmt.Errorf("error starting endpoint: %w", err) } - adapter := newWalEndpointAdapter(endpoint, logger, walEndpointMetrics, markerHandler) + adapter := newWalEndpointAdapter(endpoint, logger, walEndpointMetrics.CurryWithId(name), markerHandler) // subscribe watcher's wal.WriteTo to writer events. This will make the writer trigger the cleanup of the wal.WriteTo // series cache whenever a segment is deleted. From fbd640f63887fc1a093bbe68bc42e6e9cd7dcced Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:20:00 +0100 Subject: [PATCH 14/24] unexport constants --- .../component/common/loki/client/metrics.go | 36 +++++++++---------- .../component/common/loki/client/shards.go | 14 ++++---- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/internal/component/common/loki/client/metrics.go b/internal/component/common/loki/client/metrics.go index 046f76dd3f0..5145b243900 100644 --- a/internal/component/common/loki/client/metrics.go +++ b/internal/component/common/loki/client/metrics.go @@ -6,17 +6,17 @@ import ( ) const ( - HostLabel = "host" - TenantLabel = "tenant" - ReasonLabel = "reason" - - ReasonGeneric = "ingester_error" - ReasonRateLimited = "rate_limited" - ReasonStreamLimited = "stream_limited" - ReasonLineTooLong = "line_too_long" + labelHost = "host" + labelTenant = "tenant" + labelReason = "reason" + + reasonGeneric = "ingester_error" + reasonRateLimited = "rate_limited" + reasonStreamLimited = 
"stream_limited" + reasonLineTooLong = "line_too_long" ) -var Reasons = []string{ReasonGeneric, ReasonRateLimited, ReasonStreamLimited, ReasonLineTooLong} +var reasons = []string{reasonGeneric, reasonRateLimited, reasonStreamLimited, reasonLineTooLong} type Metrics struct { encodedBytes *prometheus.CounterVec @@ -38,39 +38,39 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { m.encodedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_encoded_bytes_total", Help: "Number of bytes encoded and ready to send.", - }, []string{HostLabel, TenantLabel}) + }, []string{labelHost, labelTenant}) m.sentBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_sent_bytes_total", Help: "Number of bytes sent.", - }, []string{HostLabel, TenantLabel}) + }, []string{labelHost, labelTenant}) m.droppedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_dropped_bytes_total", Help: "Number of bytes dropped because failed to be sent to the ingester after all retries.", - }, []string{HostLabel, TenantLabel, ReasonLabel}) + }, []string{labelHost, labelTenant, labelReason}) m.sentEntries = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_sent_entries_total", Help: "Number of log entries sent to the ingester.", - }, []string{HostLabel, TenantLabel}) + }, []string{labelHost, labelTenant}) m.droppedEntries = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_dropped_entries_total", Help: "Number of log entries dropped because failed to be sent to the ingester after all retries.", - }, []string{HostLabel, TenantLabel, ReasonLabel}) + }, []string{labelHost, labelTenant, labelReason}) m.mutatedEntries = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_mutated_entries_total", Help: "The total number of log entries that have been mutated.", - }, []string{HostLabel, TenantLabel, ReasonLabel}) + }, []string{labelHost, labelTenant, labelReason}) m.mutatedBytes = 
prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_mutated_bytes_total", Help: "The total number of bytes that have been mutated.", - }, []string{HostLabel, TenantLabel, ReasonLabel}) + }, []string{labelHost, labelTenant, labelReason}) m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "loki_write_request_duration_seconds", Help: "Duration of send requests.", - }, []string{"status_code", HostLabel, TenantLabel}) + }, []string{"status_code", labelHost, labelTenant}) m.batchRetries = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_batch_retries_total", Help: "Number of times batches has had to be retried.", - }, []string{HostLabel, TenantLabel}) + }, []string{labelHost, labelTenant}) m.countersWithHostTenant = []*prometheus.CounterVec{ m.batchRetries, m.encodedBytes, m.sentBytes, m.sentEntries, diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index be758794933..c7953254de7 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -104,9 +104,9 @@ func (q *queue) append(tenantID string, entry loki.Entry, segmentNum int) bool { // Add entry to existing batch. If we cannot add entry to batch we will drop it. if err := batch.add(entry, segmentNum); err != nil { level.Error(q.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err) - reason := ReasonGeneric + reason := reasonGeneric if errors.Is(err, errMaxStreamsLimitExceeded) { - reason = ReasonStreamLimited + reason = reasonStreamLimited } q.metrics.droppedBytes.WithLabelValues(q.cfg.URL.Host, tenantID, reason).Add(float64(len(entry.Line))) q.metrics.droppedEntries.WithLabelValues(q.cfg.URL.Host, tenantID, reason).Inc() @@ -355,7 +355,7 @@ func (s *shards) initBatchMetrics(tenantID string) { // Initialize counters to 0 so the metrics are exported before the first // occurrence of incrementing to avoid missing metrics. 
for _, counter := range s.metrics.countersWithHostTenantReason { - for _, reason := range Reasons { + for _, reason := range reasons { counter.WithLabelValues(s.cfg.URL.Host, tenantID, reason).Add(0) } } @@ -399,8 +399,8 @@ func (s *shards) sendBatch(tenantID string, batch *batch) { // Immediately drop rate limited batches to avoid HOL blocking for other tenants not experiencing throttling if s.cfg.DropRateLimitedBatches && batchIsRateLimited(status) { level.Warn(s.logger).Log("msg", "dropping batch due to rate limiting applied at ingester") - s.metrics.droppedBytes.WithLabelValues(s.cfg.URL.Host, tenantID, ReasonRateLimited).Add(bufBytes) - s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, ReasonRateLimited).Add(float64(entriesCount)) + s.metrics.droppedBytes.WithLabelValues(s.cfg.URL.Host, tenantID, reasonRateLimited).Add(bufBytes) + s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, reasonRateLimited).Add(float64(entriesCount)) return } @@ -428,9 +428,9 @@ func (s *shards) sendBatch(tenantID string, batch *batch) { level.Error(s.logger).Log("msg", "final error sending batch, no retries left, dropping data", "status", status, "tenant", tenantID, "error", err) // If the reason for the last retry error was rate limiting, count the drops as such, even if the previous errors // were for a different reason - dropReason := ReasonGeneric + dropReason := reasonGeneric if batchIsRateLimited(status) { - dropReason = ReasonRateLimited + dropReason = reasonRateLimited } s.metrics.droppedBytes.WithLabelValues(s.cfg.URL.Host, tenantID, dropReason).Add(bufBytes) s.metrics.droppedEntries.WithLabelValues(s.cfg.URL.Host, tenantID, dropReason).Add(float64(entriesCount)) From 6e405caf3ae3ba62937c342563aadef418e5b975 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:25:49 +0100 Subject: [PATCH 15/24] unexport client metrics --- .../common/loki/client/consumer_fanout.go | 2 +- 
.../component/common/loki/client/consumer_wal.go | 10 +++++----- .../common/loki/client/consumer_wal_test.go | 10 +++++----- .../component/common/loki/client/endpoint.go | 5 +++-- .../common/loki/client/endpoint_test.go | 2 +- .../common/loki/client/internal/metrics.go | 4 ++-- internal/component/common/loki/client/metrics.go | 16 ++++++++-------- internal/component/common/loki/client/shards.go | 8 ++++---- .../component/common/loki/client/shards_test.go | 10 +++++----- 9 files changed, 34 insertions(+), 33 deletions(-) diff --git a/internal/component/common/loki/client/consumer_fanout.go b/internal/component/common/loki/client/consumer_fanout.go index bea9ac74015..458eb97e298 100644 --- a/internal/component/common/loki/client/consumer_fanout.go +++ b/internal/component/common/loki/client/consumer_fanout.go @@ -22,7 +22,7 @@ func NewFanoutConsumer(logger log.Logger, reg prometheus.Registerer, cfgs ...Con } var ( - metrics = NewMetrics(reg) + metrics = newMetrics(reg) endpointsCheck = make(map[string]struct{}) ) diff --git a/internal/component/common/loki/client/consumer_wal.go b/internal/component/common/loki/client/consumer_wal.go index b12fa6dc289..00ec74f0a96 100644 --- a/internal/component/common/loki/client/consumer_wal.go +++ b/internal/component/common/loki/client/consumer_wal.go @@ -32,12 +32,12 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con } var ( - metrics = NewMetrics(reg) + metrics = newMetrics(reg) endpointsCheck = make(map[string]struct{}) walWatcherMetrics = wal.NewWatcherMetrics(reg) walMarkerMetrics = internal.NewMarkerMetrics(reg) - walEndpointMetrics = NewWALEndpointMetrics(reg) + walEndpointMetrics = newWALEndpointMetrics(reg) ) for _, cfg := range cfgs { @@ -52,7 +52,7 @@ func NewWALConsumer(logger log.Logger, reg prometheus.Registerer, walCfg wal.Con if err != nil { return nil, err } - markerHandler := internal.NewMarkerHandler(markerFileHandler, walCfg.MaxSegmentAge, logger, 
walMarkerMetrics.WithCurriedId(name)) + markerHandler := internal.NewMarkerHandler(markerFileHandler, walCfg.MaxSegmentAge, logger, walMarkerMetrics.CurryWithId(name)) endpoint, err := newEndpoint(metrics, cfg, logger, markerHandler) if err != nil { @@ -140,7 +140,7 @@ func (m *WALConsumer) stop(drain bool) { stopWG.Wait() } -func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, metrics *WALEndpointMetrics, markerHandler internal.MarkerHandler) *walEndpointAdapter { +func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, metrics *walEndpointMetrics, markerHandler internal.MarkerHandler) *walEndpointAdapter { c := &walEndpointAdapter{ logger: log.With(logger, "component", "waladapter"), metrics: metrics, @@ -160,7 +160,7 @@ func newWalEndpointAdapter(endpoint *endpoint, logger log.Logger, metrics *WALEn // reads from the WAL, entires are forwarded here so it can be written to endpoint. type walEndpointAdapter struct { logger log.Logger - metrics *WALEndpointMetrics + metrics *walEndpointMetrics endpoint *endpoint diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index 502de224401..2622cb0dbbe 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -261,9 +261,9 @@ func TestWALEndpoint(t *testing.T) { logger := log.NewLogfmtLogger(os.Stdout) marker := internal.NewNopMarkerHandler() - endpoint, err := newEndpoint(NewMetrics(reg), cfg, logger, marker) + endpoint, err := newEndpoint(newMetrics(reg), cfg, logger, marker) require.NoError(t, err) - adapter := newWalEndpointAdapter(endpoint, logger, NewWALEndpointMetrics(reg).CurryWithId("test"), marker) + adapter := newWalEndpointAdapter(endpoint, logger, newWALEndpointMetrics(reg).CurryWithId("test"), marker) //labels := model.LabelSet{"app": "test"} lines := make([]string, 0, tc.numLines) @@ -405,9 +405,9 @@ func 
runWALEndpointBenchCase(b *testing.B, bc testCase, mhFactory func(t *testin logger := log.NewLogfmtLogger(os.Stdout) marker := mhFactory(b) - endpoint, err := newEndpoint(NewMetrics(reg), cfg, logger, marker) + endpoint, err := newEndpoint(newMetrics(reg), cfg, logger, marker) require.NoError(b, err) - adapter := newWalEndpointAdapter(endpoint, logger, NewWALEndpointMetrics(reg).CurryWithId("test"), marker) + adapter := newWalEndpointAdapter(endpoint, logger, newWALEndpointMetrics(reg).CurryWithId("test"), marker) //labels := model.LabelSet{"app": "test"} var lines []string @@ -497,7 +497,7 @@ func runEndpointBenchCase(b *testing.B, bc testCase) { logger := log.NewLogfmtLogger(os.Stdout) - m := NewMetrics(reg) + m := newMetrics(reg) endpoint, err := newEndpoint(m, cfg, logger, internal.NewNopMarkerHandler()) require.NoError(b, err) diff --git a/internal/component/common/loki/client/endpoint.go b/internal/component/common/loki/client/endpoint.go index 971ac41f76e..e268f9f4e75 100644 --- a/internal/component/common/loki/client/endpoint.go +++ b/internal/component/common/loki/client/endpoint.go @@ -7,9 +7,10 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/dskit/backoff" + "github.com/grafana/alloy/internal/component/common/loki" "github.com/grafana/alloy/internal/component/common/loki/client/internal" - "github.com/grafana/dskit/backoff" ) type endpoint struct { @@ -22,7 +23,7 @@ type endpoint struct { shards *shards } -func newEndpoint(metrics *Metrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*endpoint, error) { +func newEndpoint(metrics *metrics, cfg Config, logger log.Logger, markerHandler internal.MarkerHandler) (*endpoint, error) { logger = log.With(logger, "component", "endpoint", "host", cfg.URL.Host) shards, err := newShards(metrics, logger, markerHandler, cfg) diff --git a/internal/component/common/loki/client/endpoint_test.go b/internal/component/common/loki/client/endpoint_test.go index 3e1cc451eb7..0e754fb7155 
100644 --- a/internal/component/common/loki/client/endpoint_test.go +++ b/internal/component/common/loki/client/endpoint_test.go @@ -411,7 +411,7 @@ func TestEndpoint(t *testing.T) { tt.endpointConfig.BackoffConfig = backoff.Config{MinBackoff: 1 * time.Millisecond, MaxBackoff: 2 * time.Millisecond, MaxRetries: 3} tt.endpointConfig.Timeout = 1 * time.Second - m := NewMetrics(reg) + m := newMetrics(reg) c, err := newEndpoint(m, tt.endpointConfig, log.NewNopLogger(), internal.NewNopMarkerHandler()) require.NoError(t, err) diff --git a/internal/component/common/loki/client/internal/metrics.go b/internal/component/common/loki/client/internal/metrics.go index 85d83af5fb5..07c15c043c8 100644 --- a/internal/component/common/loki/client/internal/metrics.go +++ b/internal/component/common/loki/client/internal/metrics.go @@ -27,9 +27,9 @@ func NewMarkerMetrics(reg prometheus.Registerer) *MarkerMetrics { return m } -// WithCurriedId returns a curried version of MarkerMetrics, with the id label pre-filled. This is a helper that avoids +// CurryWithId returns a curried version of MarkerMetrics, with the id label pre-filled. This is a helper that avoids // having to move the id around where it's unnecessary, and won't change inside the consumer of the metrics. 
-func (m *MarkerMetrics) WithCurriedId(id string) *MarkerMetrics { +func (m *MarkerMetrics) CurryWithId(id string) *MarkerMetrics { return &MarkerMetrics{ lastMarkedSegment: m.lastMarkedSegment.MustCurryWith(map[string]string{ "id": id, diff --git a/internal/component/common/loki/client/metrics.go b/internal/component/common/loki/client/metrics.go index 5145b243900..c74b0e339a7 100644 --- a/internal/component/common/loki/client/metrics.go +++ b/internal/component/common/loki/client/metrics.go @@ -18,7 +18,7 @@ const ( var reasons = []string{reasonGeneric, reasonRateLimited, reasonStreamLimited, reasonLineTooLong} -type Metrics struct { +type metrics struct { encodedBytes *prometheus.CounterVec sentBytes *prometheus.CounterVec droppedBytes *prometheus.CounterVec @@ -32,8 +32,8 @@ type Metrics struct { countersWithHostTenantReason []*prometheus.CounterVec } -func NewMetrics(reg prometheus.Registerer) *Metrics { - var m Metrics +func newMetrics(reg prometheus.Registerer) *metrics { + var m metrics m.encodedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "loki_write_encoded_bytes_total", @@ -95,12 +95,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { return &m } -type WALEndpointMetrics struct { +type walEndpointMetrics struct { lastReadTimestamp *prometheus.GaugeVec } -func NewWALEndpointMetrics(reg prometheus.Registerer) *WALEndpointMetrics { - m := &WALEndpointMetrics{ +func newWALEndpointMetrics(reg prometheus.Registerer) *walEndpointMetrics { + m := &walEndpointMetrics{ lastReadTimestamp: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "loki_write", @@ -118,8 +118,8 @@ func NewWALEndpointMetrics(reg prometheus.Registerer) *WALEndpointMetrics { return m } -func (m *WALEndpointMetrics) CurryWithId(id string) *WALEndpointMetrics { - return &WALEndpointMetrics{ +func (m *walEndpointMetrics) CurryWithId(id string) *walEndpointMetrics { + return &walEndpointMetrics{ lastReadTimestamp: m.lastReadTimestamp.MustCurryWith(map[string]string{ 
"id": id, }), diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index c7953254de7..5f1be31792f 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -34,7 +34,7 @@ type queuedBatch struct { Batch *batch } -func newQueue(metrics *Metrics, logger log.Logger, cfg Config) *queue { +func newQueue(metrics *metrics, logger log.Logger, cfg Config) *queue { // Capacity is the worst case size in bytes desired for the send queue. This value is used to calculate the size of // the buffered channel. The worst case scenario assumed is that every batch buffered in full, hence // the channel capacity would be calculated as: bufferChannelSize = Capacity / BatchSize. @@ -57,7 +57,7 @@ func newQueue(metrics *Metrics, logger log.Logger, cfg Config) *queue { // reach the configured batch size limit. type queue struct { cfg Config - metrics *Metrics + metrics *metrics logger log.Logger c chan queuedBatch @@ -179,7 +179,7 @@ loop: // newShards creates a new shards instance for parallel processing of log entries. // It validates the configuration and creates an HTTP client for sending batches to Loki. 
-func newShards(metrics *Metrics, logger log.Logger, markerHandler SentDataMarkerHandler, cfg Config) (*shards, error) { +func newShards(metrics *metrics, logger log.Logger, markerHandler SentDataMarkerHandler, cfg Config) (*shards, error) { if cfg.URL.URL == nil { return nil, errors.New("endpoint needs target URL") } @@ -213,7 +213,7 @@ func newShards(metrics *Metrics, logger log.Logger, markerHandler SentDataMarker type shards struct { cfg Config logger log.Logger - metrics *Metrics + metrics *metrics client *http.Client markerHandler SentDataMarkerHandler diff --git a/internal/component/common/loki/client/shards_test.go b/internal/component/common/loki/client/shards_test.go index 5a0a2dcdf55..7aa915db5cd 100644 --- a/internal/component/common/loki/client/shards_test.go +++ b/internal/component/common/loki/client/shards_test.go @@ -22,7 +22,7 @@ var entry = loki.Entry{ func TestQueue_append(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued. - q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + q := newQueue(newMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, QueueConfig: QueueConfig{ Capacity: 8, @@ -59,7 +59,7 @@ func TestQueue_append(t *testing.T) { func TestQueue_drain(t *testing.T) { t.Run("should drain queue and current batch", func(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. - q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + q := newQueue(newMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, QueueConfig: QueueConfig{ Capacity: 8, @@ -80,7 +80,7 @@ func TestQueue_drain(t *testing.T) { t.Run("should only drain queue", func(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. 
- q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + q := newQueue(newMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, BatchWait: 10 * time.Second, QueueConfig: QueueConfig{ @@ -104,7 +104,7 @@ func TestQueue_drain(t *testing.T) { func TestQueue_flushAndShutdown(t *testing.T) { t.Run("should flush all batches to queue", func(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. - q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + q := newQueue(newMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, QueueConfig: QueueConfig{ Capacity: 8, @@ -142,7 +142,7 @@ func TestQueue_flushAndShutdown(t *testing.T) { t.Run("should stop early if done channel is closed", func(t *testing.T) { // a queue with 8 bytes batches and only one batch can queued at any given time. - q := newQueue(NewMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ + q := newQueue(newMetrics(prometheus.NewRegistry()), log.NewNopLogger(), Config{ BatchSize: 8, QueueConfig: QueueConfig{ Capacity: 8, From 36fbaabcb99e0b69a54d57d132dc9dfcff9dbb11 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:32:15 +0100 Subject: [PATCH 16/24] fix test --- internal/component/common/loki/client/consumer_wal_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/component/common/loki/client/consumer_wal_test.go b/internal/component/common/loki/client/consumer_wal_test.go index 2622cb0dbbe..35128db57fe 100644 --- a/internal/component/common/loki/client/consumer_wal_test.go +++ b/internal/component/common/loki/client/consumer_wal_test.go @@ -345,7 +345,7 @@ func BenchmarkEndpointImplementations(b *testing.B) { markerFileHandler, err := internal.NewMarkerFileHandler(nopLogger, dir) require.NoError(b, err) - markerHandler := internal.NewMarkerHandler(markerFileHandler, 
time.Minute, nopLogger, internal.NewMarkerMetrics(nil).WithCurriedId("test")) + markerHandler := internal.NewMarkerHandler(markerFileHandler, time.Minute, nopLogger, internal.NewMarkerMetrics(nil).CurryWithId("test")) return markerHandler }) From def54abf18f888cdf232300a7726b0b7dfc58170 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:41:24 +0100 Subject: [PATCH 17/24] fix test --- .../common/loki/client/internal/marker_handler_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/component/common/loki/client/internal/marker_handler_test.go b/internal/component/common/loki/client/internal/marker_handler_test.go index 71610610a5d..1049e88d07f 100644 --- a/internal/component/common/loki/client/internal/marker_handler_test.go +++ b/internal/component/common/loki/client/internal/marker_handler_test.go @@ -31,7 +31,7 @@ func (m *mockMarkerFileHandler) MarkSegment(segment int) { func TestMarkerHandler(t *testing.T) { logger := log.NewLogfmtLogger(os.Stdout) // drive-by test: if metrics don't have the id curried, it panics when emitting them - metrics := NewMarkerMetrics(nil).WithCurriedId("test") + metrics := NewMarkerMetrics(nil).CurryWithId("test") t.Run("returns last marked segment from file handler on start", func(t *testing.T) { mockMFH := newMockMarkerFileHandler(10) mh := NewMarkerHandler(mockMFH, time.Minute, logger, metrics) From 94f680e8324ac826fc69e5830304bb5af452b69c Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Fri, 21 Nov 2025 22:08:10 +0100 Subject: [PATCH 18/24] fix race where flushAndShutdown holds mutex while we try to drain the queue. 
This would deadlock because we would not be able to drain and hard shutdown would not cancel it --- .../component/common/loki/client/endpoint.go | 2 +- .../component/common/loki/client/shards.go | 72 ++++++++++++------- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/internal/component/common/loki/client/endpoint.go b/internal/component/common/loki/client/endpoint.go index e268f9f4e75..d83fd032211 100644 --- a/internal/component/common/loki/client/endpoint.go +++ b/internal/component/common/loki/client/endpoint.go @@ -63,8 +63,8 @@ func (c *endpoint) enqueue(entry loki.Entry, segmentNum int) bool { } func (c *endpoint) stop() { - c.shards.stop() c.cancel() + c.shards.stop() } // getEndpointName computes the specific name for each endpoint config. The name is either the configured Name setting in Config, diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index 5f1be31792f..dbf1e99e78e 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -129,54 +129,78 @@ func (q *queue) drain() []queuedBatch { var batches []queuedBatch + // First drain all batches in queue. loop: for { select { - case b := <-q.c: - // Drain all batches from the channel + case b, ok := <-q.c: + if !ok { + break loop + } batches = append(batches, b) default: - // Check for age-based ready batches - for tenantID, batch := range q.batches { - if batch.age() < q.cfg.BatchWait { - continue - } - - // Batch has exceeded wait time, remove from map and return it - delete(q.batches, tenantID) - batches = append(batches, queuedBatch{ - TenantID: tenantID, - Batch: batch, - }) - } break loop } } + + // Then check batches that are not queued but should be flushed anyway. + for tenantID, batch := range q.batches { + if batch.age() < q.cfg.BatchWait { + continue + } + + // Batch has exceeded wait time, remove from map and return it. 
+ delete(q.batches, tenantID) + batches = append(batches, queuedBatch{ + TenantID: tenantID, + Batch: batch, + }) + } + return batches } // flushAndShutdown flushes all remaining batches and closes the channel. -// It will stop early if the done channel is signaled. +// It will stop early if the done channel is closed. func (q *queue) flushAndShutdown(done chan struct{}) { - q.mu.Lock() - defer q.mu.Unlock() - loop: - for tenantID, batch := range q.batches { + for q.tryEnqueueingBatch(done) { select { - case q.c <- queuedBatch{Batch: batch, TenantID: tenantID}: - // Successfully enqueued batch for sending case <-done: - // Shutdown timeout reached, stop trying to flush break loop + case <-time.After(time.Second): } } - // It's safe to set batches to nil because a queue is never reused once we have closed it. + q.mu.Lock() + defer q.mu.Unlock() q.batches = nil close(q.c) } +// tryEnqueueingBatch tries to send a batch if necessary. If sending needs to +// be retried it will return true. +func (q *queue) tryEnqueueingBatch(done <-chan struct{}) bool { + q.mu.Lock() + defer q.mu.Unlock() + + for tenantID, batch := range q.batches { + select { + case q.c <- queuedBatch{Batch: batch, TenantID: tenantID}: + // Successfully queued a batch. If we have more we should retry this. + delete(q.batches, tenantID) + return len(q.batches) > 0 + case <-done: + // Shutdown timeout reached, stop trying to flush. + return false + default: + // Queue is full so we should try again. + return true + } + } + return false +} + // newShards creates a new shards instance for parallel processing of log entries. // It validates the configuration and creates an HTTP client for sending batches to Loki. 
func newShards(metrics *metrics, logger log.Logger, markerHandler SentDataMarkerHandler, cfg Config) (*shards, error) { From b7c1f2195318459aa435d38c91c7000c1f140b58 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Mon, 24 Nov 2025 09:21:15 +0100 Subject: [PATCH 19/24] Update comment --- internal/component/common/loki/client/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/component/common/loki/client/config.go b/internal/component/common/loki/client/config.go index fc1cefefca1..c3c7d7f05ad 100644 --- a/internal/component/common/loki/client/config.go +++ b/internal/component/common/loki/client/config.go @@ -33,11 +33,11 @@ type Config struct { // prevent HOL blocking in multitenant deployments. DropRateLimitedBatches bool - // QueueConfig controls how shards and queues are configured for endpoints. + // QueueConfig controls how shards and queues are configured for endpoint. QueueConfig QueueConfig } -// QueueConfig controls how shards and queue are configured for client. +// QueueConfig controls how shards and queues are configured for endpoints. type QueueConfig struct { // Capacity is the worst case size in bytes desired for the send queue. This value is used to calculate the size of // the buffered channel used underneath. 
The worst case scenario assumed is that every batch buffered in full, hence From ef40a0acf06dc8e35041368a119b1526db423ad8 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Mon, 12 Jan 2026 14:03:03 +0100 Subject: [PATCH 20/24] Changing queue_config is marked as experimental and add test to check it --- internal/component/loki/write/types.go | 12 +++--- internal/component/loki/write/write.go | 19 +++++++++ internal/component/loki/write/write_test.go | 43 +++++++++++++++++++++ 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/internal/component/loki/write/types.go b/internal/component/loki/write/types.go index 8d0970315ae..b5e1b7f0d69 100644 --- a/internal/component/loki/write/types.go +++ b/internal/component/loki/write/types.go @@ -77,13 +77,15 @@ type QueueConfig struct { DrainTimeout time.Duration `alloy:"drain_timeout,attr,optional"` } +var defaultQueueConfig = QueueConfig{ + Capacity: 10 * units.MiB, // considering the default BatchSize of 1MiB, this gives us a default buffered channel of size 10 + MinShards: 1, + DrainTimeout: 15 * time.Second, +} + // SetToDefault implements syntax.Defaulter.
func (q *QueueConfig) SetToDefault() { - *q = QueueConfig{ - Capacity: 10 * units.MiB, // considering the default BatchSize of 1MiB, this gives us a default buffered channel of size 10 - MinShards: 1, - DrainTimeout: 15 * time.Second, - } + *q = defaultQueueConfig } func (args Arguments) convertEndpointConfigs() []client.Config { diff --git a/internal/component/loki/write/write.go b/internal/component/loki/write/write.go index 287305280f6..b8452ea130f 100644 --- a/internal/component/loki/write/write.go +++ b/internal/component/loki/write/write.go @@ -2,6 +2,7 @@ package write import ( "context" + "errors" "fmt" "path/filepath" "sync" @@ -99,6 +100,10 @@ func New(o component.Options, args Arguments) (*Component, error) { opts: o, } + if err := validateConfigStabilityLevel(o, args); err != nil { + return nil, err + } + // Create and immediately export the receiver which remains the same for // the component's lifetime. c.receiver = loki.NewLogsReceiver(loki.WithComponentID(o.ID)) @@ -151,6 +156,10 @@ func (c *Component) Run(ctx context.Context) error { func (c *Component) Update(args component.Arguments) error { newArgs := args.(Arguments) + if err := validateConfigStabilityLevel(c.opts, newArgs); err != nil { + return err + } + c.mut.Lock() defer c.mut.Unlock() c.args = newArgs @@ -211,3 +220,13 @@ func newEntryHandler(handler loki.EntryHandler, externalLabels model.LabelSet) l return e }) } + +func validateConfigStabilityLevel(o component.Options, args Arguments) error { + canUseExperimentalConfig := o.MinStability.Permits(featuregate.StabilityExperimental) + for _, e := range args.Endpoints { + if e.QueueConfig != defaultQueueConfig && !canUseExperimentalConfig { + return errors.New("changing queue_config requires stability.level flag to be experimental") + } + } + return nil +} diff --git a/internal/component/loki/write/write_test.go b/internal/component/loki/write/write_test.go index 392cf813fb8..02e7b19b4b9 100644 --- 
a/internal/component/loki/write/write_test.go +++ b/internal/component/loki/write/write_test.go @@ -14,10 +14,12 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/atomic" + "github.com/grafana/alloy/internal/component" "github.com/grafana/alloy/internal/component/common/loki" "github.com/grafana/alloy/internal/component/common/loki/wal" "github.com/grafana/alloy/internal/component/discovery" lsf "github.com/grafana/alloy/internal/component/loki/source/file" + "github.com/grafana/alloy/internal/featuregate" loki_util "github.com/grafana/alloy/internal/loki/util" "github.com/grafana/alloy/internal/runtime/componenttest" "github.com/grafana/alloy/internal/util" @@ -318,6 +320,47 @@ func testMultipleEndpoint(t *testing.T, alterArgs func(arguments *Arguments)) { } } +func TestComponentExperimentalConfig(t *testing.T) { + t.Run("should not be able to create component with experimental config without correct flag", func(t *testing.T) { + var args Arguments + err := syntax.Unmarshal([]byte(` + endpoint { + url = "test.com" + queue_config { + min_shards = 2 + } + } + `), &args) + require.NoError(t, err) + + _, err = New(component.Options{ + MinStability: featuregate.StabilityGenerallyAvailable, + }, args) + + require.Error(t, err) + }) + + t.Run("should be able to create component with experimental config correct flag", func(t *testing.T) { + var args Arguments + err := syntax.Unmarshal([]byte(` + endpoint { + url = "test.com" + queue_config { + min_shards = 2 + } + } + `), &args) + require.NoError(t, err) + + _, err = New(component.Options{ + MinStability: featuregate.StabilityExperimental, + OnStateChange: func(e component.Exports) {}, + }, args) + + require.NoError(t, err) + }) +} + type testCase struct { linesCount int seriesCount int From ea9f26332e55585e45d4bdd535364bc89c0b2c9f Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Mon, 12 Jan 2026 14:04:49 +0100 Subject: [PATCH 21/24] Add back experimental banner --- 
docs/sources/reference/components/loki/loki.write.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index f6d6d448e4d..65589dabb3f 100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -128,6 +128,8 @@ When `retry_on_http_429` is enabled, the retry mechanism is governed by the back ### `queue_config` +{{< docs/shared lookup="stability/experimental_feature.md" source="alloy" version="" >}} + The optional `queue_config` block configures how the endpoint queues batches of logs sent to Loki. The following arguments are supported: From 757dfa0bc30f3dc634372133d809be0504fa4bed Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:16:38 +0100 Subject: [PATCH 22/24] Remove duplicated validation --- internal/component/loki/write/write.go | 4 ---- internal/component/loki/write/write_test.go | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/internal/component/loki/write/write.go b/internal/component/loki/write/write.go index b8452ea130f..48e1e186be3 100644 --- a/internal/component/loki/write/write.go +++ b/internal/component/loki/write/write.go @@ -100,10 +100,6 @@ func New(o component.Options, args Arguments) (*Component, error) { opts: o, } - if err := validateConfigStabilityLevel(o, args); err != nil { - return nil, err - } - // Create and immediately export the receiver which remains the same for // the component's lifetime. 
c.receiver = loki.NewLogsReceiver(loki.WithComponentID(o.ID)) diff --git a/internal/component/loki/write/write_test.go b/internal/component/loki/write/write_test.go index 02e7b19b4b9..7dc64e204ae 100644 --- a/internal/component/loki/write/write_test.go +++ b/internal/component/loki/write/write_test.go @@ -334,7 +334,8 @@ func TestComponentExperimentalConfig(t *testing.T) { require.NoError(t, err) _, err = New(component.Options{ - MinStability: featuregate.StabilityGenerallyAvailable, + MinStability: featuregate.StabilityGenerallyAvailable, + OnStateChange: func(e component.Exports) {}, }, args) require.Error(t, err) From 19d9b79eed4aa2b06d208ab799101c0c0ac12165 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:49:16 +0100 Subject: [PATCH 23/24] Add log when we fail to drain the whole queue during shutdown --- internal/component/common/loki/client/shards.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/component/common/loki/client/shards.go b/internal/component/common/loki/client/shards.go index dbf1e99e78e..ead98982a6f 100644 --- a/internal/component/common/loki/client/shards.go +++ b/internal/component/common/loki/client/shards.go @@ -306,6 +306,8 @@ func (s *shards) stop() { case <-time.After(s.cfg.QueueConfig.DrainTimeout): } + level.Warn(s.logger).Log("msg", "failed to flush all queues during shutdown") + // Perform hard shutdown s.cancel() <-s.done From 049f0185228a29516ef81dd82f7b5d155a24cc81 Mon Sep 17 00:00:00 2001 From: Kalle <23356117+kalleep@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:49:37 +0100 Subject: [PATCH 24/24] Update docs to describe how queue size is calculated and how the memory required for all queue scales with shards --- docs/sources/reference/components/loki/loki.write.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/sources/reference/components/loki/loki.write.md b/docs/sources/reference/components/loki/loki.write.md index 65589dabb3f..be68b4aae17 
100644 --- a/docs/sources/reference/components/loki/loki.write.md +++ b/docs/sources/reference/components/loki/loki.write.md @@ -143,6 +143,9 @@ The following arguments are supported: Each endpoint is divided into a number of concurrent _shards_ which are responsible for sending a fraction of batches. The number of shards is controlled with `min_shards` argument. Each shard has a queue of batches it keeps in memory, controlled with the `capacity` argument. +The queue size of each shard is calculated as `capacity` / `batch_size`. For example, if `batch_size` is 1MiB and `capacity` is 10MiB, each shard can queue up to 10 batches. +The maximum amount of memory required for all configured shards can be calculated as `capacity` * `min_shards`. + ### `tls_config` {{< docs/shared lookup="reference/components/tls-config-block.md" source="alloy" version="" >}}