grafana · kalleep · Mar 25, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 24, 2026
@@ -68,9 +68,12 @@ func newMetrics(reg prometheus.Registerer) *metrics {
 		NativeHistogramMinResetDuration: 1 * time.Hour,
 	}, []string{labelHost, labelTenant})
 	m.requestSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
-		Name:    "loki_write_request_size_bytes",
-		Help:    "Number of bytes for requests.",
-		Buckets: []float64{1 * KiB, 4 * KiB, 16 * KiB, 64 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, 20 * MiB},
+		Name:                            "loki_write_request_size_bytes",
+		Help:                            "Number of bytes for requests.",
+		Buckets:                         []float64{1 * KiB, 4 * KiB, 16 * KiB, 64 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, 20 * MiB},
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
 	}, []string{labelHost, labelTenant})
 	m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Name: "loki_write_request_duration_seconds",

@@ -94,7 +94,7 @@ local filename = 'alloy-loki.json';
     (
       panel.new(title='Write latency in $cluster', type='timeseries') +
       panel.withDescription(|||
-        Bytes dropped per second.
+        Percentile write latency.
       |||) +
       panel.withUnit('s') +
       panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
@@ -159,14 +159,83 @@ local filename = 'alloy-loki.json';
       panel.withDescription(|||
         Bytes dropped per second.
       |||) +
-      panel.withStacked() +
       panel.withUnit('Bps') +
+      panel.withStacked(stackingMode='off') +
+      panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
+      panel.withQueries([
+        panel.newQuery(
+          expr=|||
+            sum by(${groupby}, reason) (rate(loki_write_dropped_bytes_total{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
+          ||| % $._config,
+          legendFormat='{{${groupby}}}: {{reason}}'
+        ),
+      ])
+    ),
+
+    // Loki write request size distribution
+    (
+      panel.newNativeHistogramHeatmap('Write request size distribution in $cluster', 'bytes') +
+      panel.withDescription(|||
+        Shows distribution of write request sizes over time.
+      |||) +
+      panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) +
+      panel.withQueries([
+		panel.newQuery(
+            expr= |||
+              sum(increase(loki_write_request_size_bytes{%(instanceSelector)s}[$__rate_interval]))
+              or ignoring (le)
+              sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s}[$__rate_interval]))
-              sum(increase(loki_write_request_size_bytes{%(instanceSelector)s}[$__rate_interval]))
-              or ignoring (le)
-              sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s}[$__rate_interval]))
+              sum(increase(loki_write_request_size_bytes{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
+              or ignoring (le)
+              sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
-              sum(increase(loki_write_request_size_bytes{%(instanceSelector)s}[$__rate_interval]))
-              or ignoring (le)
-              sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s}[$__rate_interval]))
+              sum(increase(loki_write_request_size_bytes{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
+              or ignoring (le)
+              sum by (le) (increase(loki_write_request_size_bytes_bucket{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
+            ||| % $._config,
+            format='heatmap',
+            legendFormat='{{le}}',
+          ),
+      ])
+    ),
+
+    // Loki entry propagation latency
+	(
+      panel.new(title='Entry propagation latency in $cluster', type='timeseries') +
+      panel.withDescription(|||
+        p99 and p50 of entry propagation latency. Prefers native histogram, falls back to classic histogram when native is unavailable.
+      |||) +
+      panel.withUnit('s') +
       panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
       panel.withQueries([
         panel.newQuery(
           expr=|||
-            sum by(${groupby}) (rate(loki_write_dropped_bytes_total{%(instanceSelector)s, host=~"$url"}[$__rate_interval]))
+			histogram_quantile(
+			  0.99,
+			  sum by (${groupby}) (
+				rate(loki_write_entry_propagation_latency_seconds{%(instanceSelector)s}[$__rate_interval])
+			  )
+			)
+			or ignoring(le)
+			histogram_quantile(
+			  0.99,
+			  sum by (le, ${groupby}) (
+				rate(loki_write_entry_propagation_latency_seconds_bucket{%(instanceSelector)s}[$__rate_interval])
+			  )
+			)
           ||| % $._config,
+          legendFormat='{{${groupby}}} p99'
+        ),
+        panel.newQuery(
+          expr=|||
+			histogram_quantile(
+			  0.50,
+			  sum by (${groupby}) (
+				rate(loki_write_entry_propagation_latency_seconds{%(instanceSelector)s}[$__rate_interval])
+			  )
+			)
+			or ignoring(le)
+			histogram_quantile(
+			  0.50,
+			  sum by (le, ${groupby}) (
+				rate(loki_write_entry_propagation_latency_seconds_bucket{%(instanceSelector)s}[$__rate_interval])
+			  )
+			)
+          ||| % $._config,
+          legendFormat='{{${groupby}}} p50'
         ),
       ])
     ),

@@ -192,7 +192,7 @@
       },
       {
          "datasource": "${datasource}",
-         "description": "Bytes dropped per second.\n",
+         "description": "Percentile write latency.\n",
          "fieldConfig": {
             "defaults": {
                "unit": "s"
@@ -272,7 +272,7 @@
                   "fillOpacity": 20,
                   "gradientMode": "hue",
                   "stacking": {
-                     "mode": "normal"
+                     "mode": "off"
                   }
                },
                "unit": "Bps"
@@ -287,14 +287,91 @@
          "targets": [
             {
                "datasource": "${datasource}",
-               "expr": "sum by(${groupby}) (rate(loki_write_dropped_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", host=~\"$url\"}[$__rate_interval]))\n",
+               "expr": "sum by(${groupby}, reason) (rate(loki_write_dropped_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", host=~\"$url\"}[$__rate_interval]))\n",
                "instant": false,
-               "legendFormat": "__auto",
+               "legendFormat": "{{${groupby}}}: {{reason}}",
                "range": true
             }
          ],
          "title": "Bytes dropped in $cluster",
          "type": "timeseries"
+      },
+      {
+         "datasource": "${datasource}",
+         "description": "Shows distribution of write request sizes over time.\n",
+         "gridPos": {
+            "h": 10,
+            "w": 12,
+            "x": 0,
+            "y": 23
+         },
+         "maxDataPoints": 30,
+         "options": {
+            "calculate": false,
+            "cellGap": 0,
+            "color": {
+               "scheme": "Spectral"
+            },
+            "exemplars": {
+               "color": "rgba(255,0,255,0.7)"
+            },
+            "filterValues": {
+               "le": 0.10000000000000001
+            },
+            "tooltip": {
+               "show": true,
+               "yHistogram": true
+            },
+            "yAxis": {
+               "unit": "bytes"
+            }
+         },
+         "pluginVersion": "9.0.6",
+         "targets": [
+            {
+               "datasource": "${datasource}",
+               "expr": "sum(increase(loki_write_request_size_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(loki_write_request_size_bytes_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
+               "format": "heatmap",
+               "instant": false,
+               "legendFormat": "{{le}}",
+               "range": true
+            }
+         ],
+         "title": "Write request size distribution in $cluster",
+         "type": "heatmap"
+      },
+      {
+         "datasource": "${datasource}",
+         "description": "p99 and p50 of entry propagation latency. Prefers native histogram, falls back to classic histogram when native is unavailable.\n",
+         "fieldConfig": {
+            "defaults": {
+               "unit": "s"
+            }
+         },
+         "gridPos": {
+            "h": 10,
+            "w": 12,
+            "x": 12,
+            "y": 23
+         },
+         "targets": [
+            {
+               "datasource": "${datasource}",
+               "expr": "histogram_quantile(\n  0.99,\n  sum by (${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n  )\n)\nor ignoring(le)\nhistogram_quantile(\n  0.99,\n  sum by (le, ${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n  )\n)\n",
+               "instant": false,
+               "legendFormat": "{{${groupby}}} p99",
+               "range": true
+            },
+            {
+               "datasource": "${datasource}",
+               "expr": "histogram_quantile(\n  0.50,\n  sum by (${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n  )\n)\nor ignoring(le)\nhistogram_quantile(\n  0.50,\n  sum by (le, ${groupby}) (\n\trate(loki_write_entry_propagation_latency_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n  )\n)\n",
+               "instant": false,
+               "legendFormat": "{{${groupby}}} p50",
+               "range": true
+            }
+         ],
+         "title": "Entry propagation latency in $cluster",
+         "type": "timeseries"
       }
    ],
    "refresh": "30s",