Skip to content

Commit 5a977b3

Browse files
authored
Rename lru cache metrics to the same "dgraph_lru" prefix. (#2714)
* Rename lru cache metrics with a "dgraph_lru" prefix. * Rename dgraph_evicted_lists_total to dgraph_lru_evicted_total. * docs: Update metrics section for `dgraph_lru` prefixed metrics. * Rename EvictedPls variable to LcacheEvicts for consistency with other Lcache vars. * Add TODO about moving away from Prometheus's expvar collector. * Reflow metrics docs to 100-columns.
1 parent 3897db2 commit 5a977b3

File tree

3 files changed

+74
-59
lines changed

3 files changed

+74
-59
lines changed

posting/lists.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ func periodicUpdateStats(lc *y.Closer) {
137137
inUse := float64(megs)
138138

139139
stats := lcache.Stats()
140-
x.EvictedPls.Set(int64(stats.NumEvicts))
140+
x.LcacheEvicts.Set(int64(stats.NumEvicts))
141141
x.LcacheSize.Set(int64(stats.Size))
142142
x.LcacheLen.Set(int64(stats.Length))
143143

@@ -246,10 +246,10 @@ func StopLRUEviction() {
246246
func Get(key []byte) (rlist *List, err error) {
247247
lp := lcache.Get(string(key))
248248
if lp != nil {
249-
x.CacheHit.Add(1)
249+
x.LcacheHit.Add(1)
250250
return lp, nil
251251
}
252-
x.CacheMiss.Add(1)
252+
x.LcacheMiss.Add(1)
253253

254254
// Any initialization for l must be done before PutIfMissing. Once it's added
255255
// to the map, any other goroutine can retrieve it.
@@ -260,7 +260,7 @@ func Get(key []byte) (rlist *List, err error) {
260260
// We are always going to return lp to caller, whether it is l or not
261261
lp = lcache.PutIfMissing(string(key), l)
262262
if lp != l {
263-
x.CacheRace.Add(1)
263+
x.LcacheRace.Add(1)
264264
}
265265
return lp, nil
266266
}

wiki/content/deploy/index.md

+28-15
Original file line numberDiff line numberDiff line change
@@ -1395,11 +1395,14 @@ Install **[Grafana](http://docs.grafana.org/installation/)** to plot the metrics
13951395

13961396
## Metrics
13971397

1398-
Dgraph metrics follow the [metric and label conventions for Prometheus](https://prometheus.io/docs/practices/naming/).
1398+
Dgraph metrics follow the [metric and label conventions for
1399+
Prometheus](https://prometheus.io/docs/practices/naming/).
13991400

14001401
### Disk Metrics
14011402

1402-
The disk metrics let you track the disk activity of the Dgraph process. Dgraph does not interact directly with the filesystem. Instead it relies on [Badger](https://github.com/dgraph-io/badger) to read from and write to disk.
1403+
The disk metrics let you track the disk activity of the Dgraph process. Dgraph does not interact
1404+
directly with the filesystem. Instead it relies on [Badger](https://github.com/dgraph-io/badger) to
1405+
read from and write to disk.
14031406

14041407
Metrics | Description
14051408
------- | -----------
@@ -1413,9 +1416,12 @@ The disk metrics let you track the disk activity of the Dgraph process. Dgraph d
14131416

14141417
### Memory Metrics
14151418

1416-
The memory metrics let you track the memory usage of the Dgraph process. The idle and inuse metrics gives you a better sense of the active memory usage of the Dgraph process. The process memory metric shows the memory usage as measured by the operating system.
1419+
The memory metrics let you track the memory usage of the Dgraph process. The idle and inuse metrics
1420+
give you a better sense of the active memory usage of the Dgraph process. The process memory metric
1421+
shows the memory usage as measured by the operating system.
14171422

1418-
By looking at all three metrics you can see how much memory a Dgraph process is holding from the operating system and how much is actively in use.
1423+
By looking at all three metrics you can see how much memory a Dgraph process is holding from the
1424+
operating system and how much is actively in use.
14191425

14201426
Metrics | Description
14211427
------- | -----------
@@ -1425,21 +1431,28 @@ By looking at all three metrics you can see how much memory a Dgraph process is
14251431

14261432
### LRU Cache Metrics
14271433

1428-
The LRU cache metrics let you track on how well the posting list cache is being used. You can measure the cache capacity with the max posting list size (see data metrics below), which may correlate with `dgraph_evicted_lists_total` is growing.
1434+
The LRU cache metrics let you track how well the posting list cache is being used.
14291435

1430-
Metrics | Description
1431-
------- | -----------
1432-
`dgraph_cache_hits_total` | Total number of cache hits for posting lists in Dgraph.
1433-
`dgraph_cache_miss_total` | Total number of cache misses for posting lists in Dgraph.
1434-
`dgraph_cache_race_total` | Total number of cache races when getting posting lists in Dgraph.
1435-
`dgraph_evicted_lists_total` | Total number of posting lists evicted from LRU cache. A large number here could indicate a large posting list.
1436-
`dgraph_lcache_capacity_bytes` | Current size of the LRU cache. The max value should be close to the size specified by `--lru_mb`.
1437-
`dgraph_lcache_keys_total` | Total number of keys in the LRU cache.
1438-
`dgraph_lcache_size_bytes` | Size in bytes of the LRU cache.
1436+
You can track `dgraph_lru_capacity_bytes`, `dgraph_lru_evicted_total`, and `dgraph_max_list_bytes`
1437+
(see the [Data Metrics]({{< relref "#data-metrics" >}})) to determine if the cache size should be
1438+
adjusted. A high number of evictions can indicate a large posting list that is repeatedly inserted
1439+
and evicted from the cache due to insufficient sizing. The LRU cache size can be tuned with the option
1440+
`--lru_mb`.
1441+
1442+
Metrics | Description
1443+
------- | -----------
1444+
`dgraph_lru_hits_total` | Total number of cache hits for posting lists in Dgraph.
1445+
`dgraph_lru_miss_total` | Total number of cache misses for posting lists in Dgraph.
1446+
`dgraph_lru_race_total` | Total number of cache races when getting posting lists in Dgraph.
1447+
`dgraph_lru_evicted_total` | Total number of posting lists evicted from LRU cache.
1448+
`dgraph_lru_capacity_bytes` | Current size of the LRU cache. The max value should be close to the size specified by `--lru_mb`.
1449+
`dgraph_lru_keys_total` | Total number of keys in the LRU cache.
1450+
`dgraph_lru_size_bytes` | Size in bytes of the LRU cache.
14391451

14401452
### Data Metrics
14411453

1442-
The data metrics let you track the [posting list]({{< ref "/design-concepts/index.md#posting-list" >}}) store.
1454+
The data metrics let you track the [posting list]({{< ref "/design-concepts/index.md#posting-list"
1455+
>}}) store.
14431456
14441457
Metrics | Description
14451458
------- | -----------

x/metrics.go

+42-40
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ var (
3030
PostingWrites *expvar.Int
3131
BytesRead *expvar.Int
3232
BytesWrite *expvar.Int
33-
EvictedPls *expvar.Int
3433
NumQueries *expvar.Int
35-
CacheHit *expvar.Int
36-
CacheMiss *expvar.Int
37-
CacheRace *expvar.Int
34+
LcacheHit *expvar.Int
35+
LcacheMiss *expvar.Int
36+
LcacheRace *expvar.Int
37+
LcacheEvicts *expvar.Int
3838

3939
// value at particular point of time
4040
PendingQueries *expvar.Int
@@ -66,24 +66,24 @@ func init() {
6666
PendingProposals = expvar.NewInt("dgraph_pending_proposals_total")
6767
BytesRead = expvar.NewInt("dgraph_read_bytes_total")
6868
BytesWrite = expvar.NewInt("dgraph_written_bytes_total")
69-
EvictedPls = expvar.NewInt("dgraph_evicted_lists_total")
7069
PendingQueries = expvar.NewInt("dgraph_pending_queries_total")
7170
NumQueries = expvar.NewInt("dgraph_num_queries_total")
7271
AlphaHealth = expvar.NewInt("dgraph_alpha_health_status")
7372
DirtyMapSize = expvar.NewInt("dgraph_dirtymap_keys_total")
74-
LcacheSize = expvar.NewInt("dgraph_lcache_size_bytes")
75-
LcacheLen = expvar.NewInt("dgraph_lcache_keys_total")
76-
LcacheCapacity = expvar.NewInt("dgraph_lcache_capacity_bytes")
7773
NumGoRoutines = expvar.NewInt("dgraph_goroutines_total")
7874
MemoryInUse = expvar.NewInt("dgraph_memory_inuse_bytes")
7975
MemoryIdle = expvar.NewInt("dgraph_memory_idle_bytes")
8076
MemoryProc = expvar.NewInt("dgraph_memory_proc_bytes")
8177
ActiveMutations = expvar.NewInt("dgraph_active_mutations_total")
8278
PredicateStats = expvar.NewMap("dgraph_predicate_stats")
8379
Conf = expvar.NewMap("dgraph_config")
84-
CacheHit = expvar.NewInt("dgraph_cache_hits_total")
85-
CacheMiss = expvar.NewInt("dgraph_cache_miss_total")
86-
CacheRace = expvar.NewInt("dgraph_cache_race_total")
80+
LcacheHit = expvar.NewInt("dgraph_lru_hits_total")
81+
LcacheMiss = expvar.NewInt("dgraph_lru_miss_total")
82+
LcacheRace = expvar.NewInt("dgraph_lru_race_total")
83+
LcacheEvicts = expvar.NewInt("dgraph_lru_evicted_total")
84+
LcacheSize = expvar.NewInt("dgraph_lru_size_bytes")
85+
LcacheLen = expvar.NewInt("dgraph_lru_keys_total")
86+
LcacheCapacity = expvar.NewInt("dgraph_lru_capacity_bytes")
8787
MaxPlSize = expvar.NewInt("dgraph_max_list_bytes")
8888
MaxPlLength = expvar.NewInt("dgraph_max_list_length")
8989

@@ -102,20 +102,42 @@ func init() {
102102
}
103103
}()
104104

105+
// TODO: prometheus.NewExpvarCollector is not production worthy (see godocs). Use a better
106+
// way for exporting Prometheus metrics (like an OpenCensus metrics exporter).
105107
expvarCollector := prometheus.NewExpvarCollector(map[string]*prometheus.Desc{
106-
"dgraph_cache_hits_total": prometheus.NewDesc(
107-
"dgraph_cache_hits_total",
108-
"dgraph_cache_hits_total",
108+
"dgraph_lru_hits_total": prometheus.NewDesc(
109+
"dgraph_lru_hits_total",
110+
"dgraph_lru_hits_total",
109111
nil, nil,
110112
),
111-
"dgraph_cache_miss_total": prometheus.NewDesc(
112-
"dgraph_cache_miss_total",
113-
"dgraph_cache_miss_total",
113+
"dgraph_lru_miss_total": prometheus.NewDesc(
114+
"dgraph_lru_miss_total",
115+
"dgraph_lru_miss_total",
114116
nil, nil,
115117
),
116-
"dgraph_cache_race_total": prometheus.NewDesc(
117-
"dgraph_cache_race_total",
118-
"dgraph_cache_race_total",
118+
"dgraph_lru_race_total": prometheus.NewDesc(
119+
"dgraph_lru_race_total",
120+
"dgraph_lru_race_total",
121+
nil, nil,
122+
),
123+
"dgraph_lru_evicted_total": prometheus.NewDesc(
124+
"dgraph_lru_evicted_total",
125+
"dgraph_lru_evicted_total",
126+
nil, nil,
127+
),
128+
"dgraph_lru_size_bytes": prometheus.NewDesc(
129+
"dgraph_lru_size_bytes",
130+
"dgraph_lru_size_bytes",
131+
nil, nil,
132+
),
133+
"dgraph_lru_keys_total": prometheus.NewDesc(
134+
"dgraph_lru_keys_total",
135+
"dgraph_lru_keys_total",
136+
nil, nil,
137+
),
138+
"dgraph_lru_capacity_bytes": prometheus.NewDesc(
139+
"dgraph_lru_capacity_bytes",
140+
"dgraph_lru_capacity_bytes",
119141
nil, nil,
120142
),
121143
"dgraph_posting_reads_total": prometheus.NewDesc(
@@ -153,11 +175,6 @@ func init() {
153175
"dgraph_written_bytes_total",
154176
nil, nil,
155177
),
156-
"dgraph_evicted_lists_total": prometheus.NewDesc(
157-
"dgraph_evicted_lists_total",
158-
"dgraph_evicted_lists_total",
159-
nil, nil,
160-
),
161178
"dgraph_pending_queries_total": prometheus.NewDesc(
162179
"dgraph_pending_queries_total",
163180
"dgraph_pending_queries_total",
@@ -178,21 +195,6 @@ func init() {
178195
"dgraph_dirtymap_keys_total",
179196
nil, nil,
180197
),
181-
"dgraph_lcache_size_bytes": prometheus.NewDesc(
182-
"dgraph_lcache_size_bytes",
183-
"dgraph_lcache_size_bytes",
184-
nil, nil,
185-
),
186-
"dgraph_lcache_keys_total": prometheus.NewDesc(
187-
"dgraph_lcache_keys_total",
188-
"dgraph_lcache_keys_total",
189-
nil, nil,
190-
),
191-
"dgraph_lcache_capacity_bytes": prometheus.NewDesc(
192-
"dgraph_lcache_capacity_bytes",
193-
"dgraph_lcache_capacity_bytes",
194-
nil, nil,
195-
),
196198
"dgraph_goroutines_total": prometheus.NewDesc(
197199
"dgraph_goroutines_total",
198200
"dgraph_goroutines_total",

0 commit comments

Comments
 (0)