Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions internal/component/common/loki/client/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,12 @@ func newMetrics(reg prometheus.Registerer) *metrics {
NativeHistogramMinResetDuration: 1 * time.Hour,
}, []string{labelHost, labelTenant})
m.requestSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "loki_write_request_size_bytes",
Help: "Number of bytes for requests.",
Buckets: []float64{1 * KiB, 4 * KiB, 16 * KiB, 64 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, 20 * MiB},
Name: "loki_write_request_size_bytes",
Help: "Number of bytes for requests.",
Buckets: []float64{1 * KiB, 4 * KiB, 16 * KiB, 64 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, 20 * MiB},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
Comment thread
thampiotr marked this conversation as resolved.
}, []string{labelHost, labelTenant})
m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "loki_write_request_duration_seconds",
Expand Down
75 changes: 72 additions & 3 deletions operations/alloy-mixin/dashboards/loki.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ local filename = 'alloy-loki.json';
(
panel.new(title='Write latency in $cluster', type='timeseries') +
panel.withDescription(|||
Bytes dropped per second.
Percentile write latency.
|||) +
panel.withUnit('s') +
panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
Expand Down Expand Up @@ -159,14 +159,83 @@ local filename = 'alloy-loki.json';
panel.withDescription(|||
Bytes dropped per second.
|||) +
panel.withStacked() +
panel.withUnit('Bps') +
panel.withStacked(stackingMode='off') +
panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr=|||
sum by(${groupby}, reason) (rate(loki_write_dropped_bytes_total{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
Comment thread
thampiotr marked this conversation as resolved.
||| % $._config,
legendFormat='{{${groupby}}}: {{reason}}'
),
])
),

// Loki write request size distribution
(
panel.newNativeHistogramHeatmap('Write request size distribution in $cluster', 'bytes') +
panel.withDescription(|||
Shows distribution of write request sizes over time.
|||) +
panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
sum(increase(loki_write_request_size_bytes{%(instanceSelector)s}[$__rate_interval]))
or ignoring (le)
sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s}[$__rate_interval]))
Comment on lines +185 to +187
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The write request size heatmap queries don’t include the host=~"$url" selector, so the dashboard’s URL template variable won’t affect this panel (and it may unintentionally aggregate across all hosts). Add host=~"$url" to both the native and classic histogram selectors to match the other loki.write panels.

Suggested change
sum(increase(loki_write_request_size_bytes{%(instanceSelector)s}[$__rate_interval]))
or ignoring (le)
sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s}[$__rate_interval]))
sum(increase(loki_write_request_size_bytes{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
or ignoring (le)
sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))

Copilot uses AI. Check for mistakes.
||| % $._config,
format='heatmap',
legendFormat='{{le}}',
),
])
),

// Loki entry propagation latency
(
panel.new(title='Entry propagation latency in $cluster', type='timeseries') +
panel.withDescription(|||
p99 and p50 of entry propagation latency. Prefers native histogram, falls back to classic histogram when native is unavailable.
|||) +
panel.withUnit('s') +
panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr=|||
sum by(${groupby}) (rate(loki_write_dropped_bytes_total{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
histogram_quantile(
0.99,
sum by (${groupby}) (
rate(loki_write_entry_propagation_latency_seconds{%(instanceSelector)s}[$__rate_interval])
)
)
or ignoring(le)
histogram_quantile(
0.99,
sum by (le, ${groupby}) (
rate(loki_write_entry_propagation_latency_seconds_bucket{%(instanceSelector)s}[$__rate_interval])
)
Comment on lines +206 to +217
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

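The entry propagation latency queries don’t include the host=~"$url" selector, so the dashboard’s URL template variable has no effect on this panel; unlike the other loki.write charts, which are scoped to the selected host, it aggregates across all hosts. Add host=~"$url" to both the native (loki_write_entry_propagation_latency_seconds) and classic (loki_write_entry_propagation_latency_seconds_bucket) histogram selectors, in both the p99 and the p50 query.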

Copilot uses AI. Check for mistakes.
)
||| % $._config,
legendFormat='{{${groupby}}} p99'
),
panel.newQuery(
expr=|||
histogram_quantile(
0.50,
sum by (${groupby}) (
rate(loki_write_entry_propagation_latency_seconds{%(instanceSelector)s}[$__rate_interval])
)
)
or ignoring(le)
histogram_quantile(
0.50,
sum by (le, ${groupby}) (
rate(loki_write_entry_propagation_latency_seconds_bucket{%(instanceSelector)s}[$__rate_interval])
)
)
||| % $._config,
legendFormat='{{${groupby}}} p50'
),
])
),
Expand Down
85 changes: 81 additions & 4 deletions operations/alloy-mixin/rendered/dashboards/alloy-loki.json
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
},
{
"datasource": "${datasource}",
"description": "Bytes dropped per second.\n",
"description": "Percentile write latency.\n",
"fieldConfig": {
"defaults": {
"unit": "s"
Expand Down Expand Up @@ -272,7 +272,7 @@
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
"mode": "off"
}
},
"unit": "Bps"
Expand All @@ -287,14 +287,91 @@
"targets": [
{
"datasource": "${datasource}",
"expr": "sum by(${groupby}) (rate(loki_write_dropped_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", host=~\"$url\"}[$__rate_interval]))\n",
"expr": "sum by(${groupby}, reason) (rate(loki_write_dropped_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", host=~\"$url\"}[$__rate_interval]))\n",
"instant": false,
"legendFormat": "__auto",
"legendFormat": "{{${groupby}}}: {{reason}}",
"range": true
}
],
"title": "Bytes dropped in $cluster",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Shows distribution of write request sizes over time.\n",
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 23
},
"maxDataPoints": 30,
"options": {
"calculate": false,
"cellGap": 0,
"color": {
"scheme": "Spectral"
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 0.10000000000000001
},
"tooltip": {
"show": true,
"yHistogram": true
},
"yAxis": {
"unit": "bytes"
}
},
"pluginVersion": "9.0.6",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(increase(loki_write_request_size_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(loki_write_request_size_bytes_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
"format": "heatmap",
"instant": false,
"legendFormat": "{{le}}",
"range": true
Comment thread
kalleep marked this conversation as resolved.
}
],
"title": "Write request size distribution in $cluster",
"type": "heatmap"
},
{
"datasource": "${datasource}",
"description": "p99 and p50 of entry propagation latency. Prefers native histogram, falls back to classic histogram when native is unavailable.\n",
"fieldConfig": {
"defaults": {
"unit": "s"
}
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 23
},
"targets": [
{
"datasource": "${datasource}",
"expr": "histogram_quantile(\n 0.99,\n sum by (${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n )\n)\nor ignoring(le)\nhistogram_quantile(\n 0.99,\n sum by (le, ${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n )\n)\n",
"instant": false,
"legendFormat": "{{${groupby}}} p99",
"range": true
},
{
"datasource": "${datasource}",
"expr": "histogram_quantile(\n 0.50,\n sum by (${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n )\n)\nor ignoring(le)\nhistogram_quantile(\n 0.50,\n sum by (le, ${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n )\n)\n",
Comment thread
kalleep marked this conversation as resolved.
"instant": false,
"legendFormat": "{{${groupby}}} p50",
"range": true
}
],
"title": "Entry propagation latency in $cluster",
"type": "timeseries"
}
],
"refresh": "30s",
Expand Down
Loading