diff --git a/go/cmd/vtctld/plugin_prometheusbackend.go b/go/cmd/vtctld/plugin_prometheusbackend.go new file mode 100644 index 00000000000..e226da5ff79 --- /dev/null +++ b/go/cmd/vtctld/plugin_prometheusbackend.go @@ -0,0 +1,31 @@ +/* +Copyright 2018 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +// This plugin imports Prometheus to allow for instrumentation +// with the Prometheus client library + +import ( + "vitess.io/vitess/go/stats/prometheusbackend" + "vitess.io/vitess/go/vt/servenv" +) + +func init() { + servenv.OnRun(func() { + prometheusbackend.Init("vtctld") + }) +} diff --git a/go/cmd/vtgate/plugin_prometheusbackend.go b/go/cmd/vtgate/plugin_prometheusbackend.go new file mode 100644 index 00000000000..6cbfbec74fb --- /dev/null +++ b/go/cmd/vtgate/plugin_prometheusbackend.go @@ -0,0 +1,31 @@ +/* +Copyright 2018 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +// This plugin imports Prometheus to allow for instrumentation +// with the Prometheus client library + +import ( + "vitess.io/vitess/go/stats/prometheusbackend" + "vitess.io/vitess/go/vt/servenv" +) + +func init() { + servenv.OnRun(func() { + prometheusbackend.Init("vtgate") + }) +} diff --git a/go/cmd/vttablet/plugin_prometheusbackend.go b/go/cmd/vttablet/plugin_prometheusbackend.go new file mode 100644 index 00000000000..a72f269c189 --- /dev/null +++ b/go/cmd/vttablet/plugin_prometheusbackend.go @@ -0,0 +1,31 @@ +/* +Copyright 2018 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +// This plugin imports Prometheus to allow for instrumentation +// with the Prometheus client library + +import ( + "vitess.io/vitess/go/stats/prometheusbackend" + "vitess.io/vitess/go/vt/servenv" +) + +func init() { + servenv.OnRun(func() { + prometheusbackend.Init("vttablet") + }) +} diff --git a/go/cmd/vtworker/plugin_prometheusbackend.go b/go/cmd/vtworker/plugin_prometheusbackend.go new file mode 100644 index 00000000000..0579fc9b602 --- /dev/null +++ b/go/cmd/vtworker/plugin_prometheusbackend.go @@ -0,0 +1,31 @@ +/* +Copyright 2018 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +// This plugin imports Prometheus to allow for instrumentation +// with the Prometheus client library + +import ( + "vitess.io/vitess/go/stats/prometheusbackend" + "vitess.io/vitess/go/vt/servenv" +) + +func init() { + servenv.OnRun(func() { + prometheusbackend.Init("vtworker") + }) +} diff --git a/go/mysql/server.go b/go/mysql/server.go index d5f6700abb8..ee081a01ca2 100644 --- a/go/mysql/server.go +++ b/go/mysql/server.go @@ -42,10 +42,10 @@ const ( var ( // Metrics - timings = stats.NewTimings("MysqlServerTimings") - connCount = stats.NewInt("MysqlServerConnCount") - connAccept = stats.NewInt("MysqlServerConnAccepted") - connSlow = stats.NewInt("MysqlServerConnSlow") + timings = stats.NewTimings("MysqlServerTimings", "MySQL server timings") + connCount = stats.NewGauge("MysqlServerConnCount", "Active MySQL server connections") + connAccept = stats.NewCounter("MysqlServerConnAccepted", "Connections accepted by MySQL server") + connSlow = stats.NewCounter("MysqlServerConnSlow", "Connections that took more than the configured mysql_slow_connect_warn_threshold to establish") ) // A Handler is an interface used by Listener to send queries. 
diff --git a/go/proc/counting_listener.go b/go/proc/counting_listener.go index 09ee1cf3650..719ba6daa27 100644 --- a/go/proc/counting_listener.go +++ b/go/proc/counting_listener.go @@ -24,7 +24,8 @@ import ( type CountingListener struct { net.Listener - ConnCount, ConnAccept *stats.Int + ConnCount *stats.Gauge + ConnAccept *stats.Counter } type countingConnection struct { @@ -37,8 +38,8 @@ type countingConnection struct { func Published(l net.Listener, countTag, acceptTag string) net.Listener { return &CountingListener{ Listener: l, - ConnCount: stats.NewInt(countTag), - ConnAccept: stats.NewInt(acceptTag), + ConnCount: stats.NewGauge(countTag, "Active connections accepted by counting listener"), + ConnAccept: stats.NewCounter(acceptTag, "Count of connections accepted by the counting listener"), } } diff --git a/go/stats/counters.go b/go/stats/counters.go index 0bc94fea498..d71f01fbe8b 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -19,37 +19,99 @@ package stats import ( "bytes" "fmt" + "strconv" "strings" "sync" "sync/atomic" + "time" + + "vitess.io/vitess/go/sync2" + "vitess.io/vitess/go/vt/logutil" ) -// Counters is similar to expvar.Map, except that -// it doesn't allow floats. In addition, it provides -// a Counts method which can be used for tracking rates. -type Counters struct { - // mu only protects adding and retrieving the value (*int64) from the map, - // modification to the actual number (int64) should be done with atomic funcs. - mu sync.RWMutex - counts map[string]*int64 +// logCounterNegative is for throttling adding a negative value to a counter messages in logs +var logCounterNegative = logutil.NewThrottledLogger("StatsCounterNegative", 1*time.Minute) + +// Counter is expvar.Int+Get+hook +type Counter struct { + i sync2.AtomicInt64 + help string } -// NewCounters create a new Counters instance. If name is set, the variable -// gets published. 
The functional also accepts an optional list of tags that -// pre-creates them initialized to 0. -func NewCounters(name string, tags ...string) *Counters { - c := &Counters{counts: make(map[string]*int64)} - for _, tag := range tags { - c.counts[tag] = new(int64) +// NewCounter returns a new Counter +func NewCounter(name string, help string) *Counter { + v := &Counter{help: help} + if name != "" { + publish(name, v) } + return v +} + +// Add adds the provided value to the Counter +func (v *Counter) Add(delta int64) { + if delta < 0 { + logCounterNegative.Warningf("Adding a negative value to a counter, %v should be a gauge instead", v) + } + v.i.Add(delta) +} + +// Reset resets the counter value to 0 +func (v *Counter) Reset() { + v.i.Set(int64(0)) +} + +// Get returns the value +func (v *Counter) Get() int64 { + return v.i.Get() +} + +// String is the implementation of expvar.var +func (v *Counter) String() string { + return strconv.FormatInt(v.i.Get(), 10) +} + +// Help returns the help string +func (v *Counter) Help() string { + return v.help +} + +// Gauge is an unlabeled metric whose values can go up/down. +type Gauge struct { + Counter +} + +// NewGauge creates a new Gauge and publishes it if name is set +func NewGauge(name string, help string) *Gauge { + v := &Gauge{Counter: Counter{help: help}} + if name != "" { - publish(name, c) + publish(name, v) } - return c + return v } -// String is used by expvar. -func (c *Counters) String() string { +// Set sets the value +func (v *Gauge) Set(value int64) { + v.Counter.i.Set(value) +} + +// Add adds the provided value to the Gauge +func (v *Gauge) Add(delta int64) { + v.Counter.i.Add(delta) +} + +// counters is similar to expvar.Map, except that +// it doesn't allow floats. It is used to build CountersWithLabels and GaugesWithLabels. +type counters struct { + // mu only protects adding and retrieving the value (*int64) from the map, + // modification to the actual number (int64) should be done with atomic funcs. 
+ mu sync.RWMutex + counts map[string]*int64 + help string +} + +// String implements expvar +func (c *counters) String() string { b := bytes.NewBuffer(make([]byte, 0, 4096)) c.mu.RLock() @@ -69,7 +131,7 @@ func (c *Counters) String() string { return b.String() } -func (c *Counters) getValueAddr(name string) *int64 { +func (c *counters) getValueAddr(name string) *int64 { c.mu.RLock() a, ok := c.counts[name] c.mu.RUnlock() @@ -92,26 +154,26 @@ func (c *Counters) getValueAddr(name string) *int64 { } // Add adds a value to a named counter. -func (c *Counters) Add(name string, value int64) { +func (c *counters) Add(name string, value int64) { a := c.getValueAddr(name) atomic.AddInt64(a, value) } -// Set sets the value of a named counter. -func (c *Counters) Set(name string, value int64) { - a := c.getValueAddr(name) - atomic.StoreInt64(a, value) -} - -// Reset resets all counter values -func (c *Counters) Reset() { +// ResetAll resets all counter values. +func (c *counters) ResetAll() { c.mu.Lock() defer c.mu.Unlock() c.counts = make(map[string]*int64) } +// Reset resets a specific counter value to 0 +func (c *counters) Reset(name string) { + a := c.getValueAddr(name) + atomic.StoreInt64(a, int64(0)) +} + // Counts returns a copy of the Counters' map. -func (c *Counters) Counts() map[string]int64 { +func (c *counters) Counts() map[string]int64 { c.mu.RLock() defer c.mu.RUnlock() @@ -122,6 +184,159 @@ func (c *Counters) Counts() map[string]int64 { return counts } +// Help returns the help string. +func (c *counters) Help() string { + return c.help +} + +// CountersWithLabels provides a labelName for the tagged values in Counters +// It provides a Counts method which can be used for tracking rates. +type CountersWithLabels struct { + counters + labelName string +} + +// NewCountersWithLabels create a new Counters instance. If name is set, the variable +// gets published. The function also accepts an optional list of tags that +// pre-creates them initialized to 0. 
+// labelName is a category name used to organize the tags in Prometheus. +func NewCountersWithLabels(name string, help string, labelName string, tags ...string) *CountersWithLabels { + c := &CountersWithLabels{ + counters: counters{ + counts: make(map[string]*int64), + help: help, + }, + labelName: labelName, + } + + for _, tag := range tags { + c.counts[tag] = new(int64) + } + if name != "" { + publish(name, c) + } + return c +} + +// LabelName returns the label name. +func (c *CountersWithLabels) LabelName() string { + return c.labelName +} + +// Add adds a value to a named counter. +func (c *CountersWithLabels) Add(name string, value int64) { + if value < 0 { + logCounterNegative.Warningf("Adding a negative value to a counter, %v should be a gauge instead", c) + } + a := c.getValueAddr(name) + atomic.AddInt64(a, value) +} + +// GaugesWithLabels is similar to CountersWithLabels, except its values can go up and down. +type GaugesWithLabels struct { + CountersWithLabels +} + +// NewGaugesWithLabels creates a new GaugesWithLabels and publishes it if the name is set. +func NewGaugesWithLabels(name string, help string, labelName string, tags ...string) *GaugesWithLabels { + g := &GaugesWithLabels{CountersWithLabels: CountersWithLabels{counters: counters{ + counts: make(map[string]*int64), + help: help, + }, labelName: labelName}} + + for _, tag := range tags { + g.CountersWithLabels.counts[tag] = new(int64) + } + if name != "" { + publish(name, g) + } + return g +} + +// Set sets the value of a named gauge. +func (g *GaugesWithLabels) Set(name string, value int64) { + a := g.CountersWithLabels.getValueAddr(name) + atomic.StoreInt64(a, value) +} + +// Add adds a value to a named gauge. +func (g *GaugesWithLabels) Add(name string, value int64) { + a := g.getValueAddr(name) + atomic.AddInt64(a, value) +} + +// CounterFunc converts a function that returns +// an int64 as an expvar. 
+// For implementations that differentiate between Counters/Gauges, +// CounterFunc's values only go up (or are reset to 0) +type CounterFunc struct { + Mf MetricFunc + help string +} + +// NewCounterFunc creates a new CounterFunc instance and publishes it if name is set +func NewCounterFunc(name string, help string, Mf MetricFunc) *CounterFunc { + c := &CounterFunc{ + Mf: Mf, + help: help, + } + + if name != "" { + publish(name, c) + } + return c +} + +// Help returns the help string +func (cf *CounterFunc) Help() string { + return cf.help +} + +// String implements expvar.Var +func (cf *CounterFunc) String() string { + return cf.Mf.String() +} + +// MetricFunc defines an interface for things that can be exported with calls to stats.CounterFunc/stats.GaugeFunc +type MetricFunc interface { + FloatVal() float64 + String() string +} + +// IntFunc converst a function that returns an int64 as both an expvar and a MetricFunc +type IntFunc func() int64 + +// FloatVal is the implementation of MetricFunc +func (f IntFunc) FloatVal() float64 { + return float64(f()) +} + +// String is the implementation of expvar.var +func (f IntFunc) String() string { + return strconv.FormatInt(f(), 10) +} + +// GaugeFunc converts a function that returns an int64 as an expvar. +// It's a wrapper around CounterFunc for values that go up/down +// for implementations (like Prometheus) that need to differ between Counters and Gauges. +type GaugeFunc struct { + CounterFunc +} + +// NewGaugeFunc creates a new GaugeFunc instance and publishes it if name is set +func NewGaugeFunc(name string, help string, Mf MetricFunc) *GaugeFunc { + i := &GaugeFunc{ + CounterFunc: CounterFunc{ + Mf: Mf, + help: help, + }} + + if name != "" { + publish(name, i) + } + return i +} + // CountersFunc converts a function that returns // a map of int64 as an expvar. 
type CountersFunc func() map[string]int64 @@ -152,76 +367,164 @@ func (f CountersFunc) String() string { return b.String() } -// MultiCounters is a multidimensional Counters implementation where +// CountersWithMultiLabels is a multidimensional Counters implementation where // names of categories are compound names made with joining multiple // strings with '.'. -type MultiCounters struct { - Counters +type CountersWithMultiLabels struct { + counters labels []string } -// NewMultiCounters creates a new MultiCounters instance, and publishes it +// NewCountersWithMultiLabels creates a new CountersWithMultiLabels instance, and publishes it // if name is set. -func NewMultiCounters(name string, labels []string) *MultiCounters { - t := &MultiCounters{ - Counters: Counters{counts: make(map[string]*int64)}, - labels: labels, +func NewCountersWithMultiLabels(name string, help string, labels []string) *CountersWithMultiLabels { + t := &CountersWithMultiLabels{ + counters: counters{ + counts: make(map[string]*int64), + help: help}, + labels: labels, } if name != "" { publish(name, t) } + return t } // Labels returns the list of labels. -func (mc *MultiCounters) Labels() []string { +func (mc *CountersWithMultiLabels) Labels() []string { return mc.labels } // Add adds a value to a named counter. len(names) must be equal to // len(Labels) -func (mc *MultiCounters) Add(names []string, value int64) { +func (mc *CountersWithMultiLabels) Add(names []string, value int64) { if len(names) != len(mc.labels) { - panic("MultiCounters: wrong number of values in Add") + panic("CountersWithMultiLabels: wrong number of values in Add") + } + if value < 0 { + logCounterNegative.Warningf("Adding a negative value to a counter, %v should be a gauge instead", mc) } - mc.Counters.Add(mapKey(names), value) + + mc.counters.Add(mapKey(names), value) +} + +// Reset resets the value of a named counter back to 0. 
len(names) +// must be equal to len(Labels) +func (mc *CountersWithMultiLabels) Reset(names []string) { + if len(names) != len(mc.labels) { + panic("CountersWithMultiLabels: wrong number of values in Reset") + } + + mc.counters.Reset(mapKey(names)) +} + +// Counts returns a copy of the Counters' map. +// The key is a single string where all labels are joiend by a "." e.g. +// "label1.label2". +func (mc *CountersWithMultiLabels) Counts() map[string]int64 { + return mc.counters.Counts() +} + +// GaugesWithMultiLabels is a CountersWithMultiLabels implementation where the values can go up and down +type GaugesWithMultiLabels struct { + CountersWithMultiLabels +} + +// NewGaugesWithMultiLabels creates a new GaugesWithMultiLabels instance, and publishes it +// if name is set. +func NewGaugesWithMultiLabels(name string, help string, labels []string) *GaugesWithMultiLabels { + t := &GaugesWithMultiLabels{ + CountersWithMultiLabels: CountersWithMultiLabels{counters: counters{ + counts: make(map[string]*int64), + help: help, + }, + labels: labels, + }} + if name != "" { + publish(name, t) + } + + return t } // Set sets the value of a named counter. len(names) must be equal to // len(Labels) -func (mc *MultiCounters) Set(names []string, value int64) { - if len(names) != len(mc.labels) { - panic("MultiCounters: wrong number of values in Set") +func (mg *GaugesWithMultiLabels) Set(names []string, value int64) { + if len(names) != len(mg.CountersWithMultiLabels.labels) { + panic("GaugesWithMultiLabels: wrong number of values in Set") + } + a := mg.getValueAddr(mapKey(names)) + atomic.StoreInt64(a, value) +} + +// Add adds a value to a named gauge. 
len(names) must be equal to +// len(Labels) +func (mg *GaugesWithMultiLabels) Add(names []string, value int64) { + if len(names) != len(mg.labels) { + panic("CountersWithMultiLabels: wrong number of values in Add") } - mc.Counters.Set(mapKey(names), value) + + mg.counters.Add(mapKey(names), value) } -// MultiCountersFunc is a multidimensional CountersFunc implementation +// CountersFuncWithMultiLabels is a multidimensional CountersFunc implementation // where names of categories are compound names made with joining // multiple strings with '.'. Since the map is returned by the // function, we assume it's in the right format (meaning each key is // of the form 'aaa.bbb.ccc' with as many elements as there are in // Labels). -type MultiCountersFunc struct { +type CountersFuncWithMultiLabels struct { CountersFunc labels []string + help string } // Labels returns the list of labels. -func (mcf *MultiCountersFunc) Labels() []string { +func (mcf *CountersFuncWithMultiLabels) Labels() []string { return mcf.labels } -// NewMultiCountersFunc creates a new MultiCountersFunc mapping to the provided +// Help returns the help string +func (mcf *CountersFuncWithMultiLabels) Help() string { + return mcf.help +} + +// NewCountersFuncWithMultiLabels creates a new CountersFuncWithMultiLabels mapping to the provided // function. -func NewMultiCountersFunc(name string, labels []string, f CountersFunc) *MultiCountersFunc { - t := &MultiCountersFunc{ +func NewCountersFuncWithMultiLabels(name string, labels []string, help string, f CountersFunc) *CountersFuncWithMultiLabels { + t := &CountersFuncWithMultiLabels{ CountersFunc: f, labels: labels, + help: help, } if name != "" { publish(name, t) } + + return t +} + +// GaugesFuncWithMultiLabels is a wrapper around CountersFuncWithMultiLabels +// for values that go up/down for implementations (like Prometheus) that need to differ between Counters and Gauges. 
+type GaugesFuncWithMultiLabels struct { + CountersFuncWithMultiLabels +} + +// NewGaugesFuncWithMultiLabels creates a new GaugesFuncWithMultiLabels mapping to the provided +// function. +func NewGaugesFuncWithMultiLabels(name string, labels []string, help string, f CountersFunc) *GaugesFuncWithMultiLabels { + t := &GaugesFuncWithMultiLabels{ + CountersFuncWithMultiLabels: CountersFuncWithMultiLabels{ + CountersFunc: f, + labels: labels, + help: help, + }} + + if name != "" { + publish(name, t) + } + return t } diff --git a/go/stats/counters_test.go b/go/stats/counters_test.go index 8f51124530c..43cf800f6da 100644 --- a/go/stats/counters_test.go +++ b/go/stats/counters_test.go @@ -27,7 +27,7 @@ import ( func TestCounters(t *testing.T) { clear() - c := NewCounters("counter1") + c := NewCountersWithLabels("counter1", "help", "type") c.Add("c1", 1) c.Add("c2", 1) c.Add("c2", 1) @@ -56,14 +56,14 @@ func TestCounters(t *testing.T) { func TestCountersTags(t *testing.T) { clear() - c := NewCounters("counterTag1") + c := NewCountersWithLabels("counterTag1", "help", "label") want := map[string]int64{} got := c.Counts() if !reflect.DeepEqual(got, want) { t.Errorf("want %v, got %v", want, got) } - c = NewCounters("counterTag2", "tag1", "tag2") + c = NewCountersWithLabels("counterTag2", "help", "label", "tag1", "tag2") want = map[string]int64{"tag1": 0, "tag2": 0} got = c.Counts() if !reflect.DeepEqual(got, want) { @@ -73,7 +73,7 @@ func TestCountersTags(t *testing.T) { func TestMultiCounters(t *testing.T) { clear() - c := NewMultiCounters("mapCounter1", []string{"aaa", "bbb"}) + c := NewCountersWithMultiLabels("mapCounter1", "help", []string{"aaa", "bbb"}) c.Add([]string{"c1a", "c1b"}, 1) c.Add([]string{"c2a", "c2b"}, 1) c.Add([]string{"c2a", "c2b"}, 1) @@ -89,7 +89,7 @@ func TestMultiCounters(t *testing.T) { if counts["c2a.c2b"] != 2 { t.Errorf("want 2, got %d", counts["c2a.c2b"]) } - f := NewMultiCountersFunc("", []string{"aaa", "bbb"}, func() map[string]int64 { + f := 
NewCountersFuncWithMultiLabels("", []string{"aaa", "bbb"}, "help", func() map[string]int64 { return map[string]int64{ "c1a.c1b": 1, "c2a.c2b": 2, @@ -102,7 +102,7 @@ func TestMultiCounters(t *testing.T) { func TestMultiCountersDot(t *testing.T) { clear() - c := NewMultiCounters("mapCounter2", []string{"aaa", "bbb"}) + c := NewCountersWithMultiLabels("mapCounter2", "help", []string{"aaa", "bbb"}) c.Add([]string{"c1.a", "c1b"}, 1) c.Add([]string{"c2a", "c2.b"}, 1) c.Add([]string{"c2a", "c2.b"}, 1) @@ -122,14 +122,14 @@ func TestMultiCountersDot(t *testing.T) { func TestCountersHook(t *testing.T) { var gotname string - var gotv *Counters + var gotv *CountersWithLabels clear() Register(func(name string, v expvar.Var) { gotname = name - gotv = v.(*Counters) + gotv = v.(*CountersWithLabels) }) - v := NewCounters("counter2") + v := NewCountersWithLabels("counter2", "help", "type") if gotname != "counter2" { t.Errorf("want counter2, got %s", gotname) } @@ -138,7 +138,7 @@ func TestCountersHook(t *testing.T) { } } -var benchCounter = NewCounters("bench") +var benchCounter = NewCountersWithLabels("bench", "help", "type") func BenchmarkCounters(b *testing.B) { clear() @@ -152,7 +152,7 @@ func BenchmarkCounters(b *testing.B) { }) } -var benchMultiCounter = NewMultiCounters("benchMulti", []string{"call", "keyspace", "dbtype"}) +var benchMultiCounter = NewCountersWithMultiLabels("benchMulti", "help", []string{"call", "keyspace", "dbtype"}) func BenchmarkMultiCounters(b *testing.B) { clear() diff --git a/go/stats/export.go b/go/stats/export.go index 6c1d57be749..1cdc452e021 100644 --- a/go/stats/export.go +++ b/go/stats/export.go @@ -74,6 +74,7 @@ func (vg *varGroup) register(nvh NewVarHook) { func (vg *varGroup) publish(name string, v expvar.Var) { vg.Lock() defer vg.Unlock() + expvar.Publish(name, v) if vg.newVarHook != nil { vg.newVarHook(name, v) @@ -195,40 +196,6 @@ func (f FloatFunc) String() string { return strconv.FormatFloat(f(), 'g', -1, 64) } -// Int is 
expvar.Int+Get+hook -type Int struct { - i sync2.AtomicInt64 -} - -// NewInt returns a new Int -func NewInt(name string) *Int { - v := new(Int) - if name != "" { - publish(name, v) - } - return v -} - -// Add adds the provided value to the Int -func (v *Int) Add(delta int64) { - v.i.Add(delta) -} - -// Set sets the value -func (v *Int) Set(value int64) { - v.i.Set(value) -} - -// Get returns the value -func (v *Int) Get() int64 { - return v.i.Get() -} - -// String is the implementation of expvar.var -func (v *Int) String() string { - return strconv.FormatInt(v.i.Get(), 10) -} - // Duration exports a time.Duration type Duration struct { i sync2.AtomicDuration @@ -261,15 +228,6 @@ func (v *Duration) String() string { return strconv.FormatInt(int64(v.i.Get()), 10) } -// IntFunc converts a function that returns -// an int64 as an expvar. -type IntFunc func() int64 - -// String is the implementation of expvar.var -func (f IntFunc) String() string { - return strconv.FormatInt(f(), 10) -} - // DurationFunc converts a function that returns // an time.Duration as an expvar. 
type DurationFunc func() time.Duration @@ -279,6 +237,11 @@ func (f DurationFunc) String() string { return strconv.FormatInt(int64(f()), 10) } +// FloatVal is the implementation of MetricFunc +func (f DurationFunc) FloatVal() float64 { + return f().Seconds() +} + // String is expvar.String+Get+hook type String struct { mu sync.Mutex diff --git a/go/stats/export_test.go b/go/stats/export_test.go index cb9ac764b55..b95db711fa7 100644 --- a/go/stats/export_test.go +++ b/go/stats/export_test.go @@ -29,8 +29,8 @@ func clear() { func TestNoHook(t *testing.T) { clear() - v := NewInt("plainint") - v.Set(1) + v := NewCounter("plainint", "help") + v.Add(1) if v.String() != "1" { t.Errorf("want 1, got %s", v.String()) } @@ -71,38 +71,55 @@ func TestFloat(t *testing.T) { } } -func TestInt(t *testing.T) { +func TestCounter(t *testing.T) { var gotname string - var gotv *Int + var gotv *Counter clear() Register(func(name string, v expvar.Var) { gotname = name - gotv = v.(*Int) + gotv = v.(*Counter) }) - v := NewInt("Int") + v := NewCounter("Int", "help") if gotname != "Int" { t.Errorf("want Int, got %s", gotname) } if gotv != v { t.Errorf("want %#v, got %#v", v, gotv) } - v.Set(5) - if v.Get() != 5 { - t.Errorf("want 5, got %v", v.Get()) - } v.Add(1) - if v.Get() != 6 { - t.Errorf("want 6, got %v", v.Get()) + if v.Get() != 1 { + t.Errorf("want 1, got %v", v.Get()) } - if v.String() != "6" { - t.Errorf("want 6, got %v", v.Get()) + if v.String() != "1" { + t.Errorf("want 1, got %v", v.Get()) + } + v.Reset() + if v.Get() != 0 { + t.Errorf("want 0, got %v", v.Get()) } - f := IntFunc(func() int64 { - return 1 +} + +func TestGaugeFunc(t *testing.T) { + var gotname string + var gotv *GaugeFunc + clear() + Register(func(name string, v expvar.Var) { + gotname = name + gotv = v.(*GaugeFunc) }) - if f.String() != "1" { - t.Errorf("want 1, got %v", f.String()) + + v := NewGaugeFunc("name", "help", IntFunc(func() int64 { + return 1 + })) + if v.String() != "1" { + t.Errorf("want 1, got %f", 
v.String()) + } + if gotv != v { + t.Errorf("want %#v, got %#v", v, gotv) + } + if gotname != "name" { + t.Errorf("want name, got %s", gotname) } } @@ -133,11 +150,29 @@ func TestDuration(t *testing.T) { t.Errorf("want 6, got %v", v.Get()) } - f := DurationFunc(func() time.Duration { - return time.Duration(1) +} + +func TestDurationFunc(t *testing.T) { + var gotname string + var gotv *CounterFunc + clear() + Register(func(name string, v expvar.Var) { + gotname = name + gotv = v.(*CounterFunc) }) - if f.String() != "1" { - t.Errorf("want 1, got %v", f.String()) + + v := NewCounterFunc("duration", "help", DurationFunc(func() time.Duration { + return time.Duration(1) + })) + + if gotv != v { + t.Errorf("want %#v, got %#v", v, gotv) + } + if v.String() != "1" { + t.Errorf("want 1, got %v", v.String()) + } + if gotname != "duration" { + t.Errorf("want duration, got %s", gotname) } } diff --git a/go/stats/influxdbbackend/influxdb_backend.go b/go/stats/influxdbbackend/influxdb_backend.go index 1fe3705403f..4fb5757c837 100644 --- a/go/stats/influxdbbackend/influxdb_backend.go +++ b/go/stats/influxdbbackend/influxdb_backend.go @@ -96,12 +96,10 @@ func statToValue(v expvar.Var) interface{} { switch v := v.(type) { case *stats.Float: return v.Get() - case *stats.Int: + case *stats.Counter: return v.Get() case stats.FloatFunc: return v() - case stats.IntFunc: - return v() default: return v.String() } diff --git a/go/stats/kebab_case_converter.go b/go/stats/kebab_case_converter.go index ad2c6f518fc..f288a8b6bbb 100644 --- a/go/stats/kebab_case_converter.go +++ b/go/stats/kebab_case_converter.go @@ -33,7 +33,7 @@ func toKebabCase(name string) (hyphenated string) { return hyphenated } hyphenated = name - for _, converter := range converters { + for _, converter := range kebabConverters { hyphenated = converter.re.ReplaceAllString(hyphenated, converter.repl) } hyphenated = strings.ToLower(hyphenated) @@ -41,7 +41,7 @@ func toKebabCase(name string) (hyphenated string) { return } 
-var converters = []struct { +var kebabConverters = []struct { re *regexp.Regexp repl string }{ diff --git a/go/stats/multidimensional_test.go b/go/stats/multidimensional_test.go index fb64b452ea1..c52490a7235 100644 --- a/go/stats/multidimensional_test.go +++ b/go/stats/multidimensional_test.go @@ -24,7 +24,7 @@ import ( func TestMultiTimingsCounterFor(t *testing.T) { clear() - mtm := NewMultiTimings("multitimings3", []string{"dim1", "dim2"}) + mtm := NewMultiTimings("multitimings3", "help", []string{"dim1", "dim2"}) mtm.Add([]string{"tag1a", "tag1b"}, 500*time.Microsecond) mtm.Add([]string{"tag1a", "tag2b"}, 500*time.Millisecond) diff --git a/go/stats/prometheusbackend/collectors.go b/go/stats/prometheusbackend/collectors.go new file mode 100644 index 00000000000..e5d02f80da6 --- /dev/null +++ b/go/stats/prometheusbackend/collectors.go @@ -0,0 +1,201 @@ +package prometheusbackend + +import ( + "strings" + + "github.com/prometheus/client_golang/prometheus" + "vitess.io/vitess/go/stats" +) + +type metricCollector struct { + counter *stats.Counter + desc *prometheus.Desc + vt prometheus.ValueType +} + +// Describe implements Collector. +func (c *metricCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. +func (c *metricCollector) Collect(ch chan<- prometheus.Metric) { + ch <- prometheus.MustNewConstMetric(c.desc, c.vt, float64(c.counter.Get())) +} + +// countersWithLabelsCollector collects stats.CountersWithLabels +type countersWithLabelsCollector struct { + counters *stats.CountersWithLabels + desc *prometheus.Desc + vt prometheus.ValueType +} + +// Describe implements Collector. +func (c *countersWithLabelsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. 
+func (c *countersWithLabelsCollector) Collect(ch chan<- prometheus.Metric) { + for tag, val := range c.counters.Counts() { + ch <- prometheus.MustNewConstMetric( + c.desc, + c.vt, + float64(val), + tag) + } +} + +// gaugesWithLabelsCollector collects stats.GaugesWithLabels +type gaugesWithLabelsCollector struct { + gauges *stats.GaugesWithLabels + desc *prometheus.Desc + vt prometheus.ValueType +} + +// Describe implements Collector. +func (g *gaugesWithLabelsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- g.desc +} + +// Collect implements Collector. +func (g *gaugesWithLabelsCollector) Collect(ch chan<- prometheus.Metric) { + for tag, val := range g.gauges.Counts() { + ch <- prometheus.MustNewConstMetric( + g.desc, + g.vt, + float64(val), + tag) + } +} + +type metricWithMultiLabelsCollector struct { + cml *stats.CountersWithMultiLabels + desc *prometheus.Desc +} + +// Describe implements Collector. +func (c *metricWithMultiLabelsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. +func (c *metricWithMultiLabelsCollector) Collect(ch chan<- prometheus.Metric) { + for lvs, val := range c.cml.Counts() { + labelValues := strings.Split(lvs, ".") + value := float64(val) + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.CounterValue, value, labelValues...) + } +} + +type multiGaugesCollector struct { + gml *stats.GaugesWithMultiLabels + desc *prometheus.Desc +} + +// Describe implements Collector. +func (c *multiGaugesCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. +func (c *multiGaugesCollector) Collect(ch chan<- prometheus.Metric) { + for lvs, val := range c.gml.Counts() { + labelValues := strings.Split(lvs, ".") + value := float64(val) + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, value, labelValues...) 
+ } +} + +type metricsFuncWithMultiLabelsCollector struct { + cfml *stats.CountersFuncWithMultiLabels + desc *prometheus.Desc + vt prometheus.ValueType +} + +// Describe implements Collector. +func (c *metricsFuncWithMultiLabelsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. +func (c *metricsFuncWithMultiLabelsCollector) Collect(ch chan<- prometheus.Metric) { + for lvs, val := range c.cfml.Counts() { + labelValues := strings.Split(lvs, ".") + value := float64(val) + ch <- prometheus.MustNewConstMetric(c.desc, c.vt, value, labelValues...) + } +} + +type timingsCollector struct { + t *stats.Timings + desc *prometheus.Desc +} + +// Describe implements Collector. +func (c *timingsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. +func (c *timingsCollector) Collect(ch chan<- prometheus.Metric) { + for cat, his := range c.t.Histograms() { + ch <- prometheus.MustNewConstHistogram( + c.desc, + uint64(his.Count()), + float64(his.Total()), + makePromBucket(his.Cutoffs(), his.Buckets()), + cat) + } +} + +func makePromBucket(cutoffs []int64, buckets []int64) map[float64]uint64 { + output := make(map[float64]uint64) + last := uint64(0) + for i := range cutoffs { + key := float64(cutoffs[i]) / 1000000000 + //TODO(zmagg): int64 => uint64 conversion. error if it overflows? + output[key] = uint64(buckets[i]) + last + last = output[key] + } + return output +} + +type multiTimingsCollector struct { + mt *stats.MultiTimings + desc *prometheus.Desc +} + +// Describe implements Collector. +func (c *multiTimingsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements Collector. 
+func (c *multiTimingsCollector) Collect(ch chan<- prometheus.Metric) {
+	for cat, his := range c.mt.Timings.Histograms() {
+		labelValues := strings.Split(cat, ".")
+		ch <- prometheus.MustNewConstHistogram(
+			c.desc,
+			uint64(his.Count()),
+			float64(his.Total()),
+			makePromBucket(his.Cutoffs(), his.Buckets()),
+			labelValues...)
+	}
+}
+
+type metricFuncCollector struct {
+	cf   *stats.CounterFunc
+	desc *prometheus.Desc
+	vt   prometheus.ValueType
+}
+
+// Describe implements Collector.
+func (c *metricFuncCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.desc
+}
+
+// Collect implements Collector.
+func (c *metricFuncCollector) Collect(ch chan<- prometheus.Metric) {
+	ch <- prometheus.MustNewConstMetric(c.desc, c.vt, float64(c.cf.Mf.FloatVal()))
+}
diff --git a/go/stats/prometheusbackend/prometheusbackend.go b/go/stats/prometheusbackend/prometheusbackend.go
new file mode 100644
index 00000000000..2dbfaa027e3
--- /dev/null
+++ b/go/stats/prometheusbackend/prometheusbackend.go
@@ -0,0 +1,205 @@
+package prometheusbackend
+
+import (
+	"expvar"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"vitess.io/vitess/go/stats"
+	"vitess.io/vitess/go/vt/logutil"
+)
+
+// PromBackend implements PullBackend using Prometheus as the backing metrics storage.
+type PromBackend struct {
+	namespace string
+}
+
+var (
+	be             *PromBackend
+	logUnsupported *logutil.ThrottledLogger
+)
+
+// Init initializes the Prometheus backend with the given namespace.
+func Init(namespace string) {
+	http.Handle("/metrics", promhttp.Handler())
+	be = &PromBackend{namespace: namespace}
+	logUnsupported = logutil.NewThrottledLogger("PrometheusUnsupportedMetricType", 1*time.Minute)
+	stats.Register(be.publishPrometheusMetric)
+}
+
+// publishPrometheusMetric is used to publish the metric to Prometheus.
+func (be *PromBackend) publishPrometheusMetric(name string, v expvar.Var) { + switch st := v.(type) { + case *stats.Counter: + be.newMetric(st, name, prometheus.CounterValue) + case *stats.Gauge: + be.newMetric(&st.Counter, name, prometheus.GaugeValue) + case *stats.CounterFunc: + be.newMetricFunc(st, name, prometheus.CounterValue) + case *stats.GaugeFunc: + be.newMetricFunc(&st.CounterFunc, name, prometheus.GaugeValue) + case *stats.CountersWithLabels: + be.newCountersWithLabels(st, name, st.LabelName(), prometheus.CounterValue) + case *stats.CountersWithMultiLabels: + be.newCountersWithMultiLabels(st, name) + case *stats.CountersFuncWithMultiLabels: + be.newMetricsFuncWithMultiLabels(st, name, prometheus.CounterValue) + case *stats.GaugesFuncWithMultiLabels: + be.newMetricsFuncWithMultiLabels(&st.CountersFuncWithMultiLabels, name, prometheus.GaugeValue) + case *stats.GaugesWithLabels: + be.newGaugesWithLabels(st, name, st.LabelName(), prometheus.GaugeValue) + case *stats.GaugesWithMultiLabels: + be.newGaugesWithMultiLabels(st, name) + case *stats.Timings: + be.newTiming(st, name) + case *stats.MultiTimings: + be.newMultiTiming(st, name) + default: + logUnsupported.Infof("Not exporting to Prometheus an unsupported metric type of %T: %s", st, name) + } +} + +func (be *PromBackend) newCountersWithLabels(c *stats.CountersWithLabels, name string, labelName string, vt prometheus.ValueType) { + collector := &countersWithLabelsCollector{ + counters: c, + desc: prometheus.NewDesc( + be.buildPromName(name), + c.Help(), + []string{labelName}, + nil), + vt: vt} + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) newGaugesWithLabels(g *stats.GaugesWithLabels, name string, labelName string, vt prometheus.ValueType) { + collector := &gaugesWithLabelsCollector{ + gauges: g, + desc: prometheus.NewDesc( + be.buildPromName(name), + g.Help(), + []string{labelName}, + nil), + vt: vt} + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) 
newCountersWithMultiLabels(cml *stats.CountersWithMultiLabels, name string) { + c := &metricWithMultiLabelsCollector{ + cml: cml, + desc: prometheus.NewDesc( + be.buildPromName(name), + cml.Help(), + labelsToSnake(cml.Labels()), + nil), + } + + prometheus.MustRegister(c) +} + +func (be *PromBackend) newGaugesWithMultiLabels(gml *stats.GaugesWithMultiLabels, name string) { + c := &multiGaugesCollector{ + gml: gml, + desc: prometheus.NewDesc( + be.buildPromName(name), + gml.Help(), + labelsToSnake(gml.Labels()), + nil), + } + + prometheus.MustRegister(c) +} + +func (be *PromBackend) newMetricsFuncWithMultiLabels(cfml *stats.CountersFuncWithMultiLabels, name string, vt prometheus.ValueType) { + collector := &metricsFuncWithMultiLabelsCollector{ + cfml: cfml, + desc: prometheus.NewDesc( + be.buildPromName(name), + cfml.Help(), + labelsToSnake(cfml.Labels()), + nil), + vt: vt, + } + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) newTiming(t *stats.Timings, name string) { + collector := &timingsCollector{ + t: t, + desc: prometheus.NewDesc( + be.buildPromName(name), + t.Help(), + []string{"Histograms"}, // hard coded label key + nil), + } + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) newMultiTiming(mt *stats.MultiTimings, name string) { + collector := &multiTimingsCollector{ + mt: mt, + desc: prometheus.NewDesc( + be.buildPromName(name), + mt.Help(), + labelsToSnake(mt.Labels()), + nil), + } + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) newMetric(c *stats.Counter, name string, vt prometheus.ValueType) { + collector := &metricCollector{ + counter: c, + desc: prometheus.NewDesc( + be.buildPromName(name), + c.Help(), + nil, + nil), + vt: vt} + + prometheus.MustRegister(collector) +} + +func (be *PromBackend) newMetricFunc(cf *stats.CounterFunc, name string, vt prometheus.ValueType) { + collector := &metricFuncCollector{ + cf: cf, + desc: prometheus.NewDesc( + be.buildPromName(name), + cf.Help(), + nil, + nil), + 
+		vt: vt}
+
+	prometheus.MustRegister(collector)
+}
+
+// buildPromName specifies the namespace as a prefix to the metric name.
+func (be *PromBackend) buildPromName(name string) string {
+	s := strings.TrimPrefix(normalizeMetric(name), be.namespace+"_")
+	return prometheus.BuildFQName("", be.namespace, s)
+}
+
+func labelsToSnake(labels []string) []string {
+	output := make([]string, len(labels))
+	for i, l := range labels {
+		output[i] = normalizeMetric(l)
+	}
+	return output
+}
+
+// normalizeMetric produces a compliant name by applying
+// special case conversions and then applying a camel case to snake case converter.
+func normalizeMetric(name string) string {
+	// Special cases
+	r := strings.NewReplacer("VSchema", "vschema", "VtGate", "vtgate")
+	name = r.Replace(name)
+
+	return stats.GetSnakeName(name)
+}
diff --git a/go/stats/prometheusbackend/prometheusbackend_test.go b/go/stats/prometheusbackend/prometheusbackend_test.go
new file mode 100644
index 00000000000..a688389717f
--- /dev/null
+++ b/go/stats/prometheusbackend/prometheusbackend_test.go
@@ -0,0 +1,270 @@
+package prometheusbackend
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"vitess.io/vitess/go/stats"
+
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+const namespace = "namespace"
+
+func TestPrometheusCounter(t *testing.T) {
+	name := "blah"
+	c := stats.NewCounter(name, "blah")
+	c.Add(1)
+	checkHandlerForMetrics(t, name, 1)
+	//TODO: ban this? And for other counter types too?
+ // c.Add(-1) + c.Reset() + checkHandlerForMetrics(t, name, 0) +} + +func TestPrometheusGauge(t *testing.T) { + name := "blah_gauge" + c := stats.NewGauge(name, "help") + c.Add(1) + checkHandlerForMetrics(t, name, 1) + c.Add(-1) + checkHandlerForMetrics(t, name, 0) + c.Set(-5) + checkHandlerForMetrics(t, name, -5) + c.Reset() + checkHandlerForMetrics(t, name, 0) +} + +func TestPrometheusCounterFunc(t *testing.T) { + name := "blah_counterfunc" + stats.NewCounterFunc(name, "help", stats.IntFunc(func() int64 { + return 2 + })) + + checkHandlerForMetrics(t, name, 2) +} + +func TestPrometheusGaugeFunc(t *testing.T) { + name := "blah_gaugefunc" + + stats.NewGaugeFunc(name, "help", stats.IntFunc(func() int64 { + return -3 + })) + + checkHandlerForMetrics(t, name, -3) +} + +func checkHandlerForMetrics(t *testing.T, metric string, value int) { + response := testMetricsHandler(t) + + expected := fmt.Sprintf("%s_%s %d", namespace, metric, value) + + if !strings.Contains(response.Body.String(), expected) { + t.Fatalf("Expected %s got %s", expected, response.Body.String()) + } +} + +func TestPrometheusCountersWithLabels(t *testing.T) { + name := "blah_counterswithlabels" + c := stats.NewCountersWithLabels(name, "help", "tag", "tag1", "tag2") + c.Add("tag1", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag2", 0) + c.Add("tag2", 41) + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag2", 41) + c.Reset("tag2") + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag2", 0) +} + +func TestPrometheusGaugesWithLabels(t *testing.T) { + name := "blah_gaugeswithlabels" + c := stats.NewGaugesWithLabels(name, "help", "tag", "tag1", "tag2") + c.Add("tag1", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", 1) + + c.Add("tag2", 1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag2", 1) + 
+ c.Set("tag1", -1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", -1) + + c.Reset("tag2") + checkHandlerForMetricWithLabels(t, name, "tag", "tag1", -1) + checkHandlerForMetricWithLabels(t, name, "tag", "tag2", 0) +} + +func checkHandlerForMetricWithLabels(t *testing.T, metric string, tagName string, tagValue string, value int) { + response := testMetricsHandler(t) + + expected := fmt.Sprintf("%s_%s{%s=\"%s\"} %d", namespace, metric, tagName, tagValue, value) + + if !strings.Contains(response.Body.String(), expected) { + t.Fatalf("Expected %s got %s", expected, response.Body.String()) + } +} + +func TestPrometheusCountersWithMultiLabels(t *testing.T) { + name := "blah_counterswithmultilabels" + labels := []string{"label1", "label2"} + labelValues := []string{"foo", "bar"} + c := stats.NewCountersWithMultiLabels(name, "help", labels) + c.Add(labelValues, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, 1) + labelValues2 := []string{"baz", "bazbar"} + c.Add(labelValues2, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues2, 1) + c.Reset(labelValues) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, 0) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues2, 1) +} + +func TestPrometheusGaugesWithMultiLabels(t *testing.T) { + name := "blah_gaugeswithmultilabels" + labels := []string{"label1", "label2"} + labelValues := []string{"foo", "bar"} + c := stats.NewGaugesWithMultiLabels(name, "help", labels) + c.Add(labelValues, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, 1) + + c.Set(labelValues, -1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, -1) + + labelValues2 := []string{"baz", "bazbar"} + c.Add(labelValues2, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, -1) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues2, 1) + + 
c.Reset(labelValues) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues, 0) + checkHandlerForMetricWithMultiLabels(t, name, labels, labelValues2, 1) +} + +func TestPrometheusCountersWithMultiLabels_AddPanic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("The code did not panic when adding to inequal label lengths") + } + }() + + name := "blah_counterswithmultilabels_inequallength" + c := stats.NewCountersWithMultiLabels(name, "help", []string{"label1", "label2"}) + c.Add([]string{"label1"}, 1) +} + +func TestPrometheusCountersFuncWithMultiLabels(t *testing.T) { + name := "blah_countersfuncwithmultilabels" + labels := []string{"label1", "label2"} + + stats.NewCountersFuncWithMultiLabels(name, labels, "help", func() map[string]int64 { + m := make(map[string]int64) + m["foo.bar"] = 1 + m["bar.baz"] = 1 + return m + }) + + checkHandlerForMetricWithMultiLabels(t, name, labels, []string{"foo", "bar"}, 1) + checkHandlerForMetricWithMultiLabels(t, name, labels, []string{"bar", "baz"}, 1) +} + +func checkHandlerForMetricWithMultiLabels(t *testing.T, metric string, labels []string, labelValues []string, value int64) { + response := testMetricsHandler(t) + + expected := fmt.Sprintf("%s_%s{%s=\"%s\",%s=\"%s\"} %d", namespace, metric, labels[0], labelValues[0], labels[1], labelValues[1], value) + + if !strings.Contains(response.Body.String(), expected) { + t.Fatalf("Expected %s got %s", expected, response.Body.String()) + } +} + +func TestPrometheusTimings(t *testing.T) { + name := "blah_timings" + cats := []string{"cat1", "cat2"} + timing := stats.NewTimings(name, "help", cats...) 
+ timing.Add("cat1", time.Duration(1000000000)) + + response := testMetricsHandler(t) + var s []string + + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.0005\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.001\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.005\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.01\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.05\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.1\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"0.5\"} %d", namespace, name, cats[0], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"1\"} %d", namespace, name, cats[0], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"5\"} %d", namespace, name, cats[0], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"10\"} %d", namespace, name, cats[0], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{Histograms=\"%s\",le=\"+Inf\"} %d", namespace, name, cats[0], 1)) + s = append(s, fmt.Sprintf("%s_%s_sum{Histograms=\"%s\"} %d", namespace, name, cats[0], 1)) + s = append(s, fmt.Sprintf("%s_%s_count{Histograms=\"%s\"} %d", namespace, name, cats[0], 1)) + + for _, line := range s { + if !strings.Contains(response.Body.String(), line) { + t.Fatalf("Expected result to contain %s, got %s", line, response.Body.String()) + } + } +} + +func TestPrometheusMultiTimings(t *testing.T) { + name := "blah_multitimings" + cats := []string{"cat1", "cat2"} + catLabels := []string{"foo", "bar"} + timing := stats.NewMultiTimings(name, "help", cats) + timing.Add(catLabels, time.Duration(1000000000)) + + response := testMetricsHandler(t) + var s []string + + s = append(s, 
fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.0005\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.001\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.005\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.01\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.05\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.1\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"0.5\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 0)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"1\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"5\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"10\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + s = append(s, fmt.Sprintf("%s_%s_bucket{%s=\"%s\",%s=\"%s\",le=\"+Inf\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + s = append(s, fmt.Sprintf("%s_%s_sum{%s=\"%s\",%s=\"%s\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + s = append(s, fmt.Sprintf("%s_%s_count{%s=\"%s\",%s=\"%s\"} %d", namespace, name, cats[0], catLabels[0], cats[1], catLabels[1], 1)) + + for _, line := range s { + if !strings.Contains(response.Body.String(), line) { + t.Fatalf("Expected result to contain %s, got %s", 
line, response.Body.String()) + } + } +} + +func TestPrometheusMultiTimings_PanicWrongLength(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("The code did not panic when adding to inequal label lengths") + } + }() + + c := stats.NewMultiTimings("name", "help", []string{"label1", "label2"}) + c.Add([]string{"label1"}, time.Duration(100000000)) +} + +func testMetricsHandler(t *testing.T) *httptest.ResponseRecorder { + req, _ := http.NewRequest("GET", "/metrics", nil) + response := httptest.NewRecorder() + + promhttp.Handler().ServeHTTP(response, req) + return response +} + +func TestMain(m *testing.M) { + Init(namespace) + os.Exit(m.Run()) +} diff --git a/go/stats/promstats/collector.go b/go/stats/promstats/collector.go deleted file mode 100644 index c1794404308..00000000000 --- a/go/stats/promstats/collector.go +++ /dev/null @@ -1,213 +0,0 @@ -/* -Copyright 2017 Google Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreedto in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* -Package promstats contains adapters to publish stats variables to prometheus (http://prometheus.io) -*/ -package promstats - -import ( - "expvar" - "fmt" - "strings" - "time" - - log "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "vitess.io/vitess/go/stats" -) - -// NewCollector returns a prometheus.Collector for a given stats var. -// It supports all stats var types except String, StringFunc and Rates. -// The returned collector still needs to be registered with prometheus registry. 
-func NewCollector(opts prometheus.Opts, v expvar.Var) prometheus.Collector { - switch st := v.(type) { - case *stats.Int: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), func() float64 { - return float64(st.Get()) - }) - case stats.IntFunc: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), func() float64 { - return float64(st()) - }) - case *stats.Duration: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), func() float64 { - return st.Get().Seconds() - }) - case stats.DurationFunc: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), func() float64 { - return st().Seconds() - }) - case *stats.Float: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), st.Get) - case stats.FloatFunc: - return prometheus.NewGaugeFunc(prometheus.GaugeOpts(opts), st) - case *stats.Counters: - return newCountersCollector(opts, st, "tag") - case stats.CountersFunc: - return newCountersCollector(opts, st, "tag") - case *stats.MultiCounters: - return newCountersCollector(opts, st, st.Labels()...) - case *stats.MultiCountersFunc: - return newCountersCollector(opts, st, st.Labels()...) - case *stats.Histogram: - return newHistogramCollector(opts, st) - case *stats.Timings: - return newTimingsCollector(opts, st, "category") - case *stats.MultiTimings: - return newTimingsCollector(opts, &st.Timings, st.Labels()...) - case *stats.String: - // prometheus can't collect string values - return nil - case stats.StringFunc: - // prometheus can't collect string values - return nil - case *stats.Rates: - // Ignore these, because monitoring tools will calculate - // rates for us. 
- return nil - default: - log.Warningf("Unsupported type for %s: %T", opts.Name, v) - return nil - } -} - -type countersCollector struct { - desc *prometheus.Desc - c stats.CountTracker - nLabels int -} - -func newCountersCollector(opts prometheus.Opts, c stats.CountTracker, labels ...string) prometheus.Collector { - desc := prometheus.NewDesc( - prometheus.BuildFQName(opts.Namespace, opts.Subsystem, opts.Name), - opts.Help, - labels, - opts.ConstLabels, - ) - return countersCollector{ - desc: desc, - c: c, - nLabels: len(labels), - } -} - -func (c countersCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.desc -} - -var replacer = strings.NewReplacer(`\\`, `\`, `\.`, `.`, `.`, "\000") - -func split(key string) []string { - return strings.Split(replacer.Replace(key), "\000") -} - -func (c countersCollector) Collect(ch chan<- prometheus.Metric) { - for k, n := range c.c.Counts() { - if c.nLabels > 1 { - labels := split(k) - if len(labels) != c.nLabels { - err := fmt.Errorf("wrong number of labels in MultiCounters key: %d != %d (key=%q)", len(labels), c.nLabels, k) - ch <- prometheus.NewInvalidMetric(c.desc, err) - continue - } - ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(n), labels...) 
- continue - } - ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(n), k) - } -} - -type histogramCollector struct { - desc *prometheus.Desc - h *stats.Histogram -} - -func newHistogramCollector(opts prometheus.Opts, h *stats.Histogram) histogramCollector { - desc := prometheus.NewDesc( - prometheus.BuildFQName(opts.Namespace, opts.Subsystem, opts.Name), - opts.Help, - nil, - opts.ConstLabels, - ) - return histogramCollector{ - desc: desc, - h: h, - } -} - -func histogramMetric(desc *prometheus.Desc, h *stats.Histogram, scale float64, labels ...string) prometheus.Metric { - count := uint64(0) - sum := float64(h.Total()) * scale - cutoffs := h.Cutoffs() - statBuckets := h.Buckets() - promBuckets := make(map[float64]uint64, len(cutoffs)) - for i, cutoff := range cutoffs { - upperBound := float64(cutoff) * scale - count += uint64(statBuckets[i]) - promBuckets[upperBound] = count - } - count += uint64(statBuckets[len(statBuckets)-1]) - return prometheus.MustNewConstHistogram(desc, count, sum, promBuckets, labels...) 
-} - -func (h histogramCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- h.desc -} - -func (h histogramCollector) Collect(ch chan<- prometheus.Metric) { - ch <- histogramMetric(h.desc, h.h, 1) -} - -type timingsCollector struct { - desc *prometheus.Desc - t *stats.Timings - nLabels int -} - -func newTimingsCollector(opts prometheus.Opts, t *stats.Timings, labels ...string) prometheus.Collector { - desc := prometheus.NewDesc( - prometheus.BuildFQName(opts.Namespace, opts.Subsystem, opts.Name), - opts.Help, - labels, - opts.ConstLabels, - ) - return timingsCollector{ - desc: desc, - t: t, - nLabels: len(labels), - } -} - -func (c timingsCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.desc -} - -func (c timingsCollector) Collect(ch chan<- prometheus.Metric) { - for k, h := range c.t.Histograms() { - if c.nLabels > 1 { - labels := split(k) - if len(labels) != c.nLabels { - err := fmt.Errorf("wrong number of labels in MultiTimings key: %d != %d (key=%q)", len(labels), c.nLabels, k) - ch <- prometheus.NewInvalidMetric(c.desc, err) - continue - } - ch <- histogramMetric(c.desc, h, 1/float64(time.Second), labels...) - continue - } - ch <- histogramMetric(c.desc, h, 1/float64(time.Second), k) - } -} diff --git a/go/stats/promstats/collector_test.go b/go/stats/promstats/collector_test.go deleted file mode 100644 index f1a4d3eb1ae..00000000000 --- a/go/stats/promstats/collector_test.go +++ /dev/null @@ -1,214 +0,0 @@ -/* -Copyright 2017 Google Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreedto in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package promstats - -import ( - "expvar" - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "vitess.io/vitess/go/stats" - - pb "github.com/prometheus/client_model/go" -) - -func testMetric(t *testing.T, v expvar.Var, load func(), desc string, vals ...string) { - coll := NewCollector(prometheus.Opts{ - Name: "test_name", - Help: "test_help", - }, v) - if load != nil { - load() - } - ch := make(chan prometheus.Metric, 1) - go func() { - coll.Collect(ch) - close(ch) - }() - for _, val := range vals { - met, ok := <-ch - if !ok { - t.Error("coll.Collect(ch): too few metrics returned") - } - if got := met.Desc().String(); got != desc { - t.Errorf("met.Desc().String(): %q, want %q", got, desc) - } - m := pb.Metric{} - if err := met.Write(&m); err != nil { - t.Fatalf("met.Write(): err=%s, want nil", err) - } - if got := m.String(); got != val { - t.Errorf("met.Write(&m); m.String(): %q, want %q", got, val) - } - } - _, ok := <-ch - if ok { - t.Error("coll.Collect(ch): too many metrics returned") - } -} - -func TestInt(t *testing.T) { - v := stats.NewInt("") - load := func() { - v.Set(1234) - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestIntFunc(t *testing.T) { - f := func() int64 { - return 1234 - } - v := stats.IntFunc(f) - testMetric(t, v, nil, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestDuration(t *testing.T) { - v := stats.NewDuration("") - load := func() { - v.Set(42 * time.Minute) - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestDurationFunc(t *testing.T) { - f := func() time.Duration { - return 42 * time.Minute - } - v := stats.DurationFunc(f) - testMetric(t, v, 
nil, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestFloat(t *testing.T) { - v := stats.NewFloat("Floaty McFloatface") - load := func() { - v.Set(1234) - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestFloatFunc(t *testing.T) { - f := func() float64 { - return 1234 - } - v := stats.FloatFunc(f) - testMetric(t, v, nil, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `gauge: `, - ) -} - -func TestCounters(t *testing.T) { - v := stats.NewCounters("") - load := func() { - v.Add("a", 1) - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [tag]}`, - `label: gauge: `, - ) -} - -func TestCountersFunc(t *testing.T) { - f := func() map[string]int64 { - return map[string]int64{"a": 1234} - } - v := stats.CountersFunc(f) - testMetric(t, v, nil, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [tag]}`, - `label: gauge: `, - ) -} - -func TestMultiCounters(t *testing.T) { - v := stats.NewMultiCounters("", []string{"label1", "label2"}) - load := func() { - v.Add([]string{"a", "b"}, 1) - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [label1 label2]}`, - `label: label: gauge: `, - ) -} - -func TestMultiCountersFunc(t *testing.T) { - f := func() map[string]int64 { - return map[string]int64{ - "a.b": 1, - } - } - v := stats.NewMultiCountersFunc("", []string{"label1", "label2"}, f) - testMetric(t, v, nil, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [label1 label2]}`, - `label: label: gauge: `, - ) -} - -func TestHistogram(t *testing.T) { - v := stats.NewHistogram("", []int64{1, 3, 5, 7}) - load := func() { - for i := int64(0); i < 10; i++ { - v.Add(i) - } - } - testMetric(t, v, load, - 
`Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: []}`, - `histogram: bucket: bucket: bucket: > `, - ) -} - -func TestTimings(t *testing.T) { - v := stats.NewTimings("") - load := func() { - for i := 100 * time.Microsecond; i < time.Second; i *= 2 { - v.Add("a", i) - } - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [category]}`, - `label: histogram: bucket: bucket: bucket: bucket: bucket: bucket: bucket: bucket: bucket: > `, - ) -} - -func TestMultiTimings(t *testing.T) { - v := stats.NewMultiTimings("", []string{"label1", "label2"}) - load := func() { - for i := 100 * time.Microsecond; i < time.Second; i *= 2 { - v.Add([]string{"a", "b"}, i) - } - } - testMetric(t, v, load, - `Desc{fqName: "test_name", help: "test_help", constLabels: {}, variableLabels: [label1 label2]}`, - `label: label: histogram: bucket: bucket: bucket: bucket: bucket: bucket: bucket: bucket: bucket: > `, - ) -} diff --git a/go/stats/rates_test.go b/go/stats/rates_test.go index a2adcf31ce5..7e8cc901a65 100644 --- a/go/stats/rates_test.go +++ b/go/stats/rates_test.go @@ -42,7 +42,7 @@ func TestRates(t *testing.T) { } clear() - c := NewCounters("rcounter1") + c := NewCountersWithLabels("rcounter1", "rcounter help", "type") r := NewRates("rates1", c, 3, -1*time.Second) r.snapshot() now = now.Add(epsilon) @@ -90,7 +90,7 @@ func TestRatesConsistency(t *testing.T) { // covered by rates, the sum of the rates reported must be // equal to the count reported by the counter. 
clear() - c := NewCounters("rcounter4") + c := NewCountersWithLabels("rcounter4", "rcounter4 help", "type") r := NewRates("rates4", c, 100, -1*time.Second) r.snapshot() @@ -123,7 +123,7 @@ func TestRatesConsistency(t *testing.T) { func TestRatesHook(t *testing.T) { clear() - c := NewCounters("rcounter2") + c := NewCountersWithLabels("rcounter2", "rcounter2 help", "type") var gotname string var gotv *Rates clear() diff --git a/go/stats/snake_case_converter.go b/go/stats/snake_case_converter.go new file mode 100644 index 00000000000..c29f8c83fbe --- /dev/null +++ b/go/stats/snake_case_converter.go @@ -0,0 +1,64 @@ +/* +Copyright 2018 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package stats + +import ( + "regexp" + "strings" +) + +// GetSnakeName calls toSnakeName on the passed in string. It produces +// a snake-cased name from the provided camel-cased name. +// It memoizes the transformation and returns the stored result if available. +func GetSnakeName(name string) string { + return toSnakeCase(name) +} + +// toSnakeCase produces a monitoring compliant name from the original. +// For systems (like Prometheus) that ask for snake-case names. +// It converts CamelCase to camel_case, and CAMEL_CASE to camel_case. +// For numbers, it converts 0.5 to v0_5. 
+func toSnakeCase(name string) (hyphenated string) { + snakeMemoizer.Lock() + defer snakeMemoizer.Unlock() + if hyphenated = snakeMemoizer.memo[name]; hyphenated != "" { + return hyphenated + } + hyphenated = name + for _, converter := range snakeConverters { + hyphenated = converter.re.ReplaceAllString(hyphenated, converter.repl) + } + hyphenated = strings.ToLower(hyphenated) + snakeMemoizer.memo[name] = hyphenated + return +} + +var snakeConverters = []struct { + re *regexp.Regexp + repl string +}{ + // example: LC -> L_C (e.g. CamelCase -> Camel_Case). + {regexp.MustCompile("([a-z])([A-Z])"), "${1}_${2}"}, + // example: CCa -> C_Ca (e.g. CCamel -> C_Camel). + {regexp.MustCompile("([A-Z])([A-Z][a-z])"), "${1}_${2}"}, + {regexp.MustCompile("\\."), "_"}, + {regexp.MustCompile("-"), "_"}, +} + +var snakeMemoizer = memoizerType{ + memo: make(map[string]string), +} diff --git a/go/stats/snake_case_converter_test.go b/go/stats/snake_case_converter_test.go new file mode 100644 index 00000000000..bcd4194ac79 --- /dev/null +++ b/go/stats/snake_case_converter_test.go @@ -0,0 +1,52 @@ +/* +Copyright 2018 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package stats + +import "testing" + +func TestToSnakeCase(t *testing.T) { + var snakeCaseTest = []struct{ input, output string }{ + {"Camel", "camel"}, + {"Camel", "camel"}, + {"CamelCase", "camel_case"}, + {"CamelCaseAgain", "camel_case_again"}, + {"CCamel", "c_camel"}, + {"CCCamel", "cc_camel"}, + {"CAMEL_CASE", "camel_case"}, + {"camel-case", "camel_case"}, + {"0", "0"}, + {"0.0", "0_0"}, + {"JSON", "json"}, + } + + for _, tt := range snakeCaseTest { + if got, want := toSnakeCase(tt.input), tt.output; got != want { + t.Errorf("want '%s', got '%s'", want, got) + } + } +} + +func TestSnakeMemoize(t *testing.T) { + key := "Test" + if snakeMemoizer.memo[key] != "" { + t.Errorf("want '', got '%s'", snakeMemoizer.memo[key]) + } + toSnakeCase(key) + if snakeMemoizer.memo[key] != "test" { + t.Errorf("want 'test', got '%s'", snakeMemoizer.memo[key]) + } +} diff --git a/go/stats/timings.go b/go/stats/timings.go index f5370ccc069..ce3cd685e72 100644 --- a/go/stats/timings.go +++ b/go/stats/timings.go @@ -37,20 +37,25 @@ type Timings struct { mu sync.RWMutex histograms map[string]*Histogram hook func(string, time.Duration) + help string } // NewTimings creates a new Timings object, and publishes it if name is set. // categories is an optional list of categories to initialize to 0. // Categories that aren't initialized will be missing from the map until the // first time they are updated. -func NewTimings(name string, categories ...string) *Timings { - t := &Timings{histograms: make(map[string]*Histogram)} +func NewTimings(name string, help string, categories ...string) *Timings { + t := &Timings{ + histograms: make(map[string]*Histogram), + help: help, + } for _, cat := range categories { t.histograms[cat] = NewGenericHistogram("", bucketCutoffs, bucketLabels, "Count", "Time") } if name != "" { publish(name, t) } + return t } @@ -150,6 +155,11 @@ func (t *Timings) Cutoffs() []int64 { return bucketCutoffs } +// Help returns the help string. 
+func (t *Timings) Help() string { + return t.help +} + var bucketCutoffs = []int64{5e5, 1e6, 5e6, 1e7, 5e7, 1e8, 5e8, 1e9, 5e9, 1e10} var bucketLabels []string @@ -171,14 +181,18 @@ type MultiTimings struct { } // NewMultiTimings creates a new MultiTimings object. -func NewMultiTimings(name string, labels []string) *MultiTimings { +func NewMultiTimings(name string, help string, labels []string) *MultiTimings { t := &MultiTimings{ - Timings: Timings{histograms: make(map[string]*Histogram)}, - labels: labels, + Timings: Timings{ + histograms: make(map[string]*Histogram), + help: help, + }, + labels: labels, } if name != "" { publish(name, t) } + return t } diff --git a/go/stats/timings_test.go b/go/stats/timings_test.go index a704ed5d3f5..e5a76426269 100644 --- a/go/stats/timings_test.go +++ b/go/stats/timings_test.go @@ -24,7 +24,7 @@ import ( func TestTimings(t *testing.T) { clear() - tm := NewTimings("timings1") + tm := NewTimings("timings1", "help") tm.Add("tag1", 500*time.Microsecond) tm.Add("tag1", 1*time.Millisecond) tm.Add("tag2", 1*time.Millisecond) @@ -36,7 +36,7 @@ func TestTimings(t *testing.T) { func TestMultiTimings(t *testing.T) { clear() - mtm := NewMultiTimings("maptimings1", []string{"dim1", "dim2"}) + mtm := NewMultiTimings("maptimings1", "help", []string{"dim1", "dim2"}) mtm.Add([]string{"tag1a", "tag1b"}, 500*time.Microsecond) mtm.Add([]string{"tag1a", "tag1b"}, 1*time.Millisecond) mtm.Add([]string{"tag2a", "tag2b"}, 1*time.Millisecond) @@ -56,7 +56,7 @@ func TestTimingsHook(t *testing.T) { }) name := "timings2" - v := NewTimings(name) + v := NewTimings(name, "help") if gotname != name { t.Errorf("got %q, want %q", gotname, name) } diff --git a/go/streamlog/streamlog.go b/go/streamlog/streamlog.go index 50e9b3418d7..31a81bbd16f 100644 --- a/go/streamlog/streamlog.go +++ b/go/streamlog/streamlog.go @@ -40,9 +40,15 @@ var ( // QueryLogFormat controls the format of the query log (either text or json) QueryLogFormat = flag.String("querylog-format", 
"text", "format for query logs (\"text\" or \"json\")") - sendCount = stats.NewCounters("StreamlogSend") - deliveredCount = stats.NewMultiCounters("StreamlogDelivered", []string{"Log", "Subscriber"}) - deliveryDropCount = stats.NewMultiCounters("StreamlogDeliveryDroppedMessages", []string{"Log", "Subscriber"}) + sendCount = stats.NewCountersWithLabels("StreamlogSend", "stream log send count", "logger_names") + deliveredCount = stats.NewCountersWithMultiLabels( + "StreamlogDelivered", + "Stream log delivered", + []string{"Log", "Subscriber"}) + deliveryDropCount = stats.NewCountersWithMultiLabels( + "StreamlogDeliveryDroppedMessages", + "Dropped messages by streamlog delivery", + []string{"Log", "Subscriber"}) ) const ( diff --git a/go/vt/binlog/binlog_streamer.go b/go/vt/binlog/binlog_streamer.go index e5ed24e1739..f133109f01c 100644 --- a/go/vt/binlog/binlog_streamer.go +++ b/go/vt/binlog/binlog_streamer.go @@ -37,7 +37,7 @@ import ( ) var ( - binlogStreamerErrors = stats.NewCounters("BinlogStreamerErrors") + binlogStreamerErrors = stats.NewCountersWithLabels("BinlogStreamerErrors", "error count when streaming binlog", "state") // ErrClientEOF is returned by Streamer if the stream ended because the // consumer of the stream indicated it doesn't want any more events. 
diff --git a/go/vt/binlog/binlogplayer/binlog_player.go b/go/vt/binlog/binlogplayer/binlog_player.go index d3bb97d2e03..e9b13c7d42a 100644 --- a/go/vt/binlog/binlogplayer/binlog_player.go +++ b/go/vt/binlog/binlogplayer/binlog_player.go @@ -88,7 +88,7 @@ func (bps *Stats) GetLastPosition() mysql.Position { // NewStats creates a new Stats structure func NewStats() *Stats { bps := &Stats{} - bps.Timings = stats.NewTimings("") + bps.Timings = stats.NewTimings("", "") bps.Rates = stats.NewRates("", bps.Timings, 15, 60e9) return bps } diff --git a/go/vt/binlog/updatestreamctl.go b/go/vt/binlog/updatestreamctl.go index a628d926ffd..e97278bef49 100644 --- a/go/vt/binlog/updatestreamctl.go +++ b/go/vt/binlog/updatestreamctl.go @@ -47,12 +47,12 @@ var usStateNames = map[int64]string{ } var ( - streamCount = stats.NewCounters("UpdateStreamStreamCount") - updateStreamErrors = stats.NewCounters("UpdateStreamErrors") - keyrangeStatements = stats.NewInt("UpdateStreamKeyRangeStatements") - keyrangeTransactions = stats.NewInt("UpdateStreamKeyRangeTransactions") - tablesStatements = stats.NewInt("UpdateStreamTablesStatements") - tablesTransactions = stats.NewInt("UpdateStreamTablesTransactions") + streamCount = stats.NewCountersWithLabels("UpdateStreamStreamCount", "update stream count", "type") + updateStreamErrors = stats.NewCountersWithLabels("UpdateStreamErrors", "update stream error count", "type") + keyrangeStatements = stats.NewCounter("UpdateStreamKeyRangeStatements", "update stream key range statement count") + keyrangeTransactions = stats.NewCounter("UpdateStreamKeyRangeTransactions", "update stream key range transaction count") + tablesStatements = stats.NewCounter("UpdateStreamTablesStatements", "update stream table statement count") + tablesTransactions = stats.NewCounter("UpdateStreamTablesTransactions", "update stream table transaction count") ) // UpdateStreamControl is the interface an UpdateStream service implements diff --git a/go/vt/dbconnpool/connection_pool.go 
b/go/vt/dbconnpool/connection_pool.go index 4b127b0ed6b..2d27c337aaf 100644 --- a/go/vt/dbconnpool/connection_pool.go +++ b/go/vt/dbconnpool/connection_pool.go @@ -65,15 +65,15 @@ func NewConnectionPool(name string, capacity int, idleTimeout time.Duration) *Co return cp } usedNames[name] = true - stats.Publish(name+"Capacity", stats.IntFunc(cp.Capacity)) - stats.Publish(name+"Available", stats.IntFunc(cp.Available)) - stats.Publish(name+"Active", stats.IntFunc(cp.Active)) - stats.Publish(name+"InUse", stats.IntFunc(cp.InUse)) - stats.Publish(name+"MaxCap", stats.IntFunc(cp.MaxCap)) - stats.Publish(name+"WaitCount", stats.IntFunc(cp.WaitCount)) - stats.Publish(name+"WaitTime", stats.DurationFunc(cp.WaitTime)) - stats.Publish(name+"IdleTimeout", stats.DurationFunc(cp.IdleTimeout)) - stats.Publish(name+"IdleClosed", stats.IntFunc(cp.IdleClosed)) + stats.NewGaugeFunc(name+"Capacity", "Connection pool capacity", stats.IntFunc(cp.Capacity)) + stats.NewGaugeFunc(name+"Available", "Connection pool available", stats.IntFunc(cp.Available)) + stats.NewGaugeFunc(name+"Active", "Connection pool active", stats.IntFunc(cp.Active)) + stats.NewGaugeFunc(name+"InUse", "Connection pool in-use", stats.IntFunc(cp.InUse)) + stats.NewGaugeFunc(name+"MaxCap", "Connection pool max cap", stats.IntFunc(cp.MaxCap)) + stats.NewCounterFunc(name+"WaitCount", "Connection pool wait count", stats.IntFunc(cp.WaitCount)) + stats.NewCounterFunc(name+"WaitTime", "Connection pool wait time", stats.DurationFunc(cp.WaitTime)) + stats.NewCounterFunc(name+"IdleTimeout", "Connection pool idle timeout", stats.DurationFunc(cp.IdleTimeout)) + stats.NewGaugeFunc(name+"IdleClosed", "Connection pool idle closed", stats.IntFunc(cp.IdleClosed)) return cp } diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 52f8041a37d..592177de63b 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -64,8 +64,8 @@ import ( ) var ( - hcErrorCounters = 
stats.NewMultiCounters("HealthcheckErrors", []string{"Keyspace", "ShardName", "TabletType"}) - hcMasterPromotedCounters = stats.NewMultiCounters("HealthcheckMasterPromoted", []string{"Keyspace", "ShardName"}) + hcErrorCounters = stats.NewCountersWithMultiLabels("HealthcheckErrors", "Healthcheck Errors", []string{"Keyspace", "ShardName", "TabletType"}) + hcMasterPromotedCounters = stats.NewCountersWithMultiLabels("HealthcheckMasterPromoted", "Master promoted in keyspace/shard name because of health check errors", []string{"Keyspace", "ShardName"}) healthcheckOnce sync.Once ) @@ -332,7 +332,11 @@ func NewHealthCheck(retryDelay, healthCheckTimeout time.Duration) HealthCheck { // RegisterStats registers the connection counts stats func (hc *HealthCheckImpl) RegisterStats() { - stats.NewMultiCountersFunc("HealthcheckConnections", []string{"Keyspace", "ShardName", "TabletType"}, hc.servingConnStats) + stats.NewCountersFuncWithMultiLabels( + "HealthcheckConnections", + []string{"Keyspace", "ShardName", "TabletType"}, + "the number of healthcheck connections registered", + hc.servingConnStats) } // ServeHTTP is part of the http.Handler interface. It renders the current state of the discovery gateway tablet cache into json. diff --git a/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go index 6672bd39b86..7f2d4a76e4c 100644 --- a/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go @@ -146,7 +146,7 @@ func NewFakeMysqlDaemon(db *fakesqldb.DB) *FakeMysqlDaemon { } if db != nil { result.appPool = dbconnpool.NewConnectionPool("AppConnPool", 5, time.Minute) - result.appPool.Open(db.ConnParams(), stats.NewTimings("")) + result.appPool.Open(db.ConnParams(), stats.NewTimings("", "")) } return result } @@ -410,12 +410,12 @@ func (fmd *FakeMysqlDaemon) GetAppConnection(ctx context.Context) (*dbconnpool.P // GetDbaConnection is part of the MysqlDaemon interface. 
func (fmd *FakeMysqlDaemon) GetDbaConnection() (*dbconnpool.DBConnection, error) { - return dbconnpool.NewDBConnection(fmd.db.ConnParams(), stats.NewTimings("")) + return dbconnpool.NewDBConnection(fmd.db.ConnParams(), stats.NewTimings("", "")) } // GetAllPrivsConnection is part of the MysqlDaemon interface. func (fmd *FakeMysqlDaemon) GetAllPrivsConnection() (*dbconnpool.DBConnection, error) { - return dbconnpool.NewDBConnection(fmd.db.ConnParams(), stats.NewTimings("")) + return dbconnpool.NewDBConnection(fmd.db.ConnParams(), stats.NewTimings("", "")) } // SetSemiSyncEnabled is part of the MysqlDaemon interface. diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 63230d48242..6080a3159f0 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -73,9 +73,9 @@ var ( // masterConnectRetry is used in 'SET MASTER' commands masterConnectRetry = flag.Duration("master_connect_retry", 10*time.Second, "how long to wait in between slave -> connection attempts. Only precise to the second.") - dbaMysqlStats = stats.NewTimings("MysqlDba") - allprivsMysqlStats = stats.NewTimings("MysqlAllPrivs") - appMysqlStats = stats.NewTimings("MysqlApp") + dbaMysqlStats = stats.NewTimings("MysqlDba", "MySQL DBA stats") + allprivsMysqlStats = stats.NewTimings("MysqlAllPrivs", "MySQL stats for all privs") + appMysqlStats = stats.NewTimings("MysqlApp", "MySQL app stats") ) // Mysqld is the object that represents a mysqld daemon running on this server. 
diff --git a/go/vt/servenv/buildinfo.go b/go/vt/servenv/buildinfo.go index fdbfae39252..2a6f4cbc904 100644 --- a/go/vt/servenv/buildinfo.go +++ b/go/vt/servenv/buildinfo.go @@ -88,10 +88,10 @@ func init() { stats.NewString("BuildHost").Set(AppVersion.buildHost) stats.NewString("BuildUser").Set(AppVersion.buildUser) - stats.NewInt("BuildTimestamp").Set(AppVersion.buildTime) + stats.NewGauge("BuildTimestamp", "build timestamp").Set(AppVersion.buildTime) stats.NewString("BuildGitRev").Set(AppVersion.buildGitRev) stats.NewString("BuildGitBranch").Set(AppVersion.buildGitBranch) - stats.NewInt("BuildNumber").Set(AppVersion.jenkinsBuildNumber) + stats.NewGauge("BuildNumber", "build number").Set(AppVersion.jenkinsBuildNumber) stats.NewString("GoVersion").Set(AppVersion.goVersion) stats.NewString("GoOS").Set(AppVersion.goOS) stats.NewString("GoArch").Set(AppVersion.goArch) diff --git a/go/vt/servenv/servenv.go b/go/vt/servenv/servenv.go index 140ecf4b8e6..f87d0eb7cee 100644 --- a/go/vt/servenv/servenv.go +++ b/go/vt/servenv/servenv.go @@ -98,7 +98,7 @@ func Init() { if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, fdLimit); err != nil { log.Errorf("max-open-fds failed: %v", err) } - fdl := stats.NewInt("MaxFds") + fdl := stats.NewGauge("MaxFds", "File descriptor limit") fdl.Set(int64(fdLimit.Cur)) onInitHooks.Fire() diff --git a/go/vt/sqlannotation/sqlannotation.go b/go/vt/sqlannotation/sqlannotation.go index 24098ffde36..9542ac0d193 100644 --- a/go/vt/sqlannotation/sqlannotation.go +++ b/go/vt/sqlannotation/sqlannotation.go @@ -40,7 +40,7 @@ const ( ) var ( - filteredReplicationUnfriendlyStatementsCount = stats.NewInt("FilteredReplicationUnfriendlyStatementsCount") + filteredReplicationUnfriendlyStatementsCount = stats.NewCounter("FilteredReplicationUnfriendlyStatementsCount", "Count of unfriendly statements found in filtered replication") ) // AnnotateIfDML annotates 'sql' based on 'keyspaceIDs' diff --git a/go/vt/srvtopo/resilient_server.go 
b/go/vt/srvtopo/resilient_server.go index 8252401fe61..3501ff37f9d 100644 --- a/go/vt/srvtopo/resilient_server.go +++ b/go/vt/srvtopo/resilient_server.go @@ -120,7 +120,7 @@ type ResilientServer struct { topoServer *topo.Server cacheTTL time.Duration cacheRefresh time.Duration - counts *stats.Counters + counts *stats.CountersWithLabels // mutex protects the cache map itself, not the individual // values in the cache. @@ -215,7 +215,7 @@ func NewResilientServer(base *topo.Server, counterPrefix string) *ResilientServe topoServer: base, cacheTTL: *srvTopoCacheTTL, cacheRefresh: *srvTopoCacheRefresh, - counts: stats.NewCounters(counterPrefix + "Counts"), + counts: stats.NewCountersWithLabels(counterPrefix+"Counts", "Resilient srvtopo server operations", "type"), srvKeyspaceNamesCache: make(map[string]*srvKeyspaceNamesEntry), srvKeyspaceCache: make(map[string]*srvKeyspaceEntry), diff --git a/go/vt/vtgate/buffer/buffer_test.go b/go/vt/vtgate/buffer/buffer_test.go index 17badd61950..ee94dd52010 100644 --- a/go/vt/vtgate/buffer/buffer_test.go +++ b/go/vt/vtgate/buffer/buffer_test.go @@ -899,17 +899,17 @@ func TestShutdown(t *testing.T) { // resetVariables resets the task level variables. The code does not reset these // with very failover. 
func resetVariables() { - starts.Reset() - stops.Reset() + starts.ResetAll() + stops.ResetAll() - utilizationSum.Reset() - utilizationDryRunSum.Reset() + utilizationSum.ResetAll() + utilizationDryRunSum.ResetAll() - requestsBuffered.Reset() - requestsBufferedDryRun.Reset() - requestsDrained.Reset() - requestsEvicted.Reset() - requestsSkipped.Reset() + requestsBuffered.ResetAll() + requestsBufferedDryRun.ResetAll() + requestsDrained.ResetAll() + requestsEvicted.ResetAll() + requestsSkipped.ResetAll() } // checkVariables makes sure that the invariants described in variables.go diff --git a/go/vt/vtgate/buffer/shard_buffer.go b/go/vt/vtgate/buffer/shard_buffer.go index f936884a521..87c2b7520ee 100644 --- a/go/vt/vtgate/buffer/shard_buffer.go +++ b/go/vt/vtgate/buffer/shard_buffer.go @@ -263,7 +263,7 @@ func (sb *shardBuffer) startBufferingLocked(err error) { // Reset monitoring data from previous failover. lastRequestsInFlightMax.Set(sb.statsKey, 0) lastRequestsDryRunMax.Set(sb.statsKey, 0) - failoverDurationSumMs.Set(sb.statsKey, 0) + failoverDurationSumMs.Reset(sb.statsKey) sb.lastStart = sb.now() sb.logErrorIfStateNotLocked(stateIdle) diff --git a/go/vt/vtgate/buffer/variables.go b/go/vt/vtgate/buffer/variables.go index 71917c7b5b0..a8e3ee960bb 100644 --- a/go/vt/vtgate/buffer/variables.go +++ b/go/vt/vtgate/buffer/variables.go @@ -23,22 +23,34 @@ import "vitess.io/vitess/go/stats" var ( // starts counts how often we started buffering (including dry-run bufferings). - starts = stats.NewMultiCounters("BufferStarts", []string{"Keyspace", "ShardName"}) + starts = stats.NewCountersWithMultiLabels( + "BufferStarts", + "Buffering operation starts, including dry-run", + []string{"Keyspace", "ShardName"}) // stops counts how often we triggered the stop of a buffering, including // dry-run bufferings. // See the type "stopReason" below for all possible values of "Reason". 
- stops = stats.NewMultiCounters("BufferStops", []string{"Keyspace", "ShardName", "Reason"}) + stops = stats.NewCountersWithMultiLabels( + "BufferStops", + "Buffering operation stops, including dry-runs", + []string{"Keyspace", "ShardName", "Reason"}) // failoverDurationSumMs is the cumulative sum of all failover durations. // In connection with "starts" it can be used to calculate a moving average. - failoverDurationSumMs = stats.NewMultiCounters("BufferFailoverDurationSumMs", []string{"Keyspace", "ShardName"}) + failoverDurationSumMs = stats.NewCountersWithMultiLabels( + "BufferFailoverDurationSumMs", + "Total buffering failover duration", + []string{"Keyspace", "ShardName"}) // utilizationSum is the cumulative sum of the maximum buffer utilization // (in percentage) during each failover. // Utilization = maximum number of requests buffered / buffer size. // In connection with "starts" it can be used to calculate a moving average. // TODO(mberlin): Replace this with a MultiHistogram once it's available. - utilizationSum = stats.NewMultiCounters("BufferUtilizationSum", []string{"Keyspace", "ShardName"}) + utilizationSum = stats.NewGaugesWithMultiLabels( + "BufferUtilizationSum", + "Cumulative buffer utilization (in %) during failover", + []string{"Keyspace", "ShardName"}) // utilizationDryRunSum is the cumulative sum of the maximum *theoretical* // buffer utilization (in percentage) during each failover. // Utilization = maximum number of requests buffered seen / buffer size. @@ -48,26 +60,44 @@ var ( // utilization). The moving average would be 100% because there were two // failovers in that period. // TODO(mberlin): Replace this with a MultiHistogram once it's available. 
- utilizationDryRunSum = stats.NewMultiCounters("BufferUtilizationDryRunSum", []string{"Keyspace", "ShardName"}) + utilizationDryRunSum = stats.NewCountersWithMultiLabels( + "BufferUtilizationDryRunSum", + "Cumulative buffer utilization % during failover (dry-run)", + []string{"Keyspace", "ShardName"}) // requestsBuffered tracks how many requests were added to the buffer. // NOTE: The two counters "Buffered" and "Skipped" should cover all requests // which passed through the buffer. - requestsBuffered = stats.NewMultiCounters("BufferRequestsBuffered", []string{"Keyspace", "ShardName"}) + requestsBuffered = stats.NewCountersWithMultiLabels( + "BufferRequestsBuffered", + "Buffered requests", + []string{"Keyspace", "ShardName"}) // requestsBufferedDryRun tracks how many requests would have been added to // the buffer (dry-run mode). - requestsBufferedDryRun = stats.NewMultiCounters("BufferRequestsBufferedDryRun", []string{"Keyspace", "ShardName"}) + requestsBufferedDryRun = stats.NewCountersWithMultiLabels( + "BufferRequestsBufferedDryRun", + "Buffered requests (dry-run)", + []string{"Keyspace", "ShardName"}) // requestsBuffered tracks how many requests were drained from the buffer. // NOTE: The sum of the two counters "Drained" and "Evicted" should be // identical to the "Buffered" counter value. - requestsDrained = stats.NewMultiCounters("BufferRequestsDrained", []string{"Keyspace", "ShardName"}) + requestsDrained = stats.NewCountersWithMultiLabels( + "BufferRequestsDrained", + "Drained buffered requests", + []string{"Keyspace", "ShardName"}) // requestsEvicted tracks how many requests were evicted early from the buffer. // See the type "evictedReason" below for all possible values of "Reason". 
- requestsEvicted = stats.NewMultiCounters("BufferRequestsEvicted", []string{"Keyspace", "ShardName", "Reason"}) + requestsEvicted = stats.NewCountersWithMultiLabels( + "BufferRequestsEvicted", + "Evicted buffered requests", + []string{"Keyspace", "ShardName", "Reason"}) // requestsSkipped tracks how many requests would have been buffered but // eventually were not (includes dry-run bufferings). // See the type "skippedReason" below for all possible values of "Reason". - requestsSkipped = stats.NewMultiCounters("BufferRequestsSkipped", []string{"Keyspace", "ShardName", "Reason"}) + requestsSkipped = stats.NewCountersWithMultiLabels( + "BufferRequestsSkipped", + "Skipped buffering requests (incl. dry-run)", + []string{"Keyspace", "ShardName", "Reason"}) ) // stopReason is used in "stopsByReason" as "Reason" label. @@ -116,27 +146,27 @@ const ( // "no value for this label set (NaN)" to "a value". // "statsKey" should have two members for keyspace and shard. func initVariablesForShard(statsKey []string) { - starts.Set(statsKey, 0) + starts.Reset(statsKey) for _, reason := range stopReasons { key := append(statsKey, string(reason)) - stops.Set(key, 0) + stops.Reset(key) } - failoverDurationSumMs.Set(statsKey, 0) + failoverDurationSumMs.Reset(statsKey) utilizationSum.Set(statsKey, 0) - utilizationDryRunSum.Set(statsKey, 0) + utilizationDryRunSum.Reset(statsKey) - requestsBuffered.Set(statsKey, 0) - requestsBufferedDryRun.Set(statsKey, 0) - requestsDrained.Set(statsKey, 0) + requestsBuffered.Reset(statsKey) + requestsBufferedDryRun.Reset(statsKey) + requestsDrained.Reset(statsKey) for _, reason := range evictReasons { key := append(statsKey, string(reason)) - requestsEvicted.Set(key, 0) + requestsEvicted.Reset(key) } for _, reason := range skippedReasons { key := append(statsKey, string(reason)) - requestsSkipped.Set(key, 0) + requestsSkipped.Reset(key) } } @@ -145,17 +175,20 @@ func initVariablesForShard(statsKey []string) { var ( // bufferSize publishes the configured 
per vtgate buffer size. It can be used // to calculate the utilization of the buffer. - bufferSize = stats.NewInt("BufferSize") - // lastFailoverDurationMs tracks for how long vtgate buffered requests during - // the last failover. - // The value for a given shard will be reset at the next failover. - lastFailoverDurationMs = stats.NewMultiCounters("BufferLastFailoverDurationMs", []string{"Keyspace", "ShardName"}) - // lastRequestsInFlightMax has the maximum value of buffered requests in flight - // of the last failover. - // The value for a given shard will be reset at the next failover. - lastRequestsInFlightMax = stats.NewMultiCounters("BufferLastRequestsInFlightMax", []string{"Keyspace", "ShardName"}) + bufferSize = stats.NewGauge("BufferSize", "The configured per vtgate buffer size") + lastFailoverDurationMs = stats.NewGaugesWithMultiLabels( + "BufferLastFailoverDurationMs", + "Buffered requests during the last failover. The value for a given shard will be reset at the next failover.", + []string{"Keyspace", "ShardName"}) + lastRequestsInFlightMax = stats.NewGaugesWithMultiLabels( + "BufferLastRequestsInFlightMax", + "The max value of buffered requests in flight of the last failover. The value for a given shard will be reset at the next failover.", + []string{"Keyspace", "ShardName"}) // lastRequestsDryRunMax has the maximum number of requests which were seen during // a dry-run buffering of the last failover. // The value for a given shard will be reset at the next failover. 
- lastRequestsDryRunMax = stats.NewMultiCounters("BufferLastRequestsDryRunMax", []string{"Keyspace", "ShardName"}) + lastRequestsDryRunMax = stats.NewGaugesWithMultiLabels( + "BufferLastRequestsDryRunMax", + "Max # of requests which were seen during a dry-run buffering of the last failover", + []string{"Keyspace", "ShardName"}) ) diff --git a/go/vt/vtgate/buffer/variables_test.go b/go/vt/vtgate/buffer/variables_test.go index 101ece25c1c..8890b208d6b 100644 --- a/go/vt/vtgate/buffer/variables_test.go +++ b/go/vt/vtgate/buffer/variables_test.go @@ -51,13 +51,13 @@ func TestVariablesAreInitialized(t *testing.T) { statsKey := []string{"init_test", "0"} type testCase struct { desc string - counter *stats.MultiCounters + counter *stats.CountersWithMultiLabels statsKey []string } testCases := []testCase{ {"starts", starts, statsKey}, {"failoverDurationSumMs", failoverDurationSumMs, statsKey}, - {"utilizationSum", utilizationSum, statsKey}, + {"utilizationSum", &utilizationSum.CountersWithMultiLabels, statsKey}, {"utilizationDryRunSum", utilizationDryRunSum, statsKey}, {"requestsBuffered", requestsBuffered, statsKey}, {"requestsBufferedDryRun", requestsBufferedDryRun, statsKey}, @@ -85,7 +85,7 @@ func TestVariablesAreInitialized(t *testing.T) { } } -func checkEntry(counters *stats.MultiCounters, statsKey []string, want int) error { +func checkEntry(counters *stats.CountersWithMultiLabels, statsKey []string, want int) error { name := strings.Join(statsKey, ".") got, ok := counters.Counts()[name] if !ok { diff --git a/go/vt/vtgate/executor.go b/go/vt/vtgate/executor.go index 92b4c190b3c..40f1d0bb9cc 100644 --- a/go/vt/vtgate/executor.go +++ b/go/vt/vtgate/executor.go @@ -104,10 +104,10 @@ func NewExecutor(ctx context.Context, serv srvtopo.Server, cell, statsName strin e.vm.watchSrvVSchema(ctx, cell) executorOnce.Do(func() { - stats.Publish("QueryPlanCacheLength", stats.IntFunc(e.plans.Length)) - stats.Publish("QueryPlanCacheSize", stats.IntFunc(e.plans.Size)) - 
stats.Publish("QueryPlanCacheCapacity", stats.IntFunc(e.plans.Capacity)) - stats.Publish("QueryPlanCacheEvictions", stats.IntFunc(e.plans.Evictions)) + stats.NewGaugeFunc("QueryPlanCacheLength", "Query plan cache length", stats.IntFunc(e.plans.Length)) + stats.NewGaugeFunc("QueryPlanCacheSize", "Query plan cache size", stats.IntFunc(e.plans.Size)) + stats.NewGaugeFunc("QueryPlanCacheCapacity", "Query plan cache capacity", stats.IntFunc(e.plans.Capacity)) + stats.NewCounterFunc("QueryPlanCacheEvictions", "Query plan cache evictions", stats.IntFunc(e.plans.Evictions)) stats.Publish("QueryPlanCacheOldest", stats.StringFunc(func() string { return fmt.Sprintf("%v", e.plans.Oldest()) })) diff --git a/go/vt/vtgate/gateway/hybridgateway.go b/go/vt/vtgate/gateway/hybridgateway.go index 485fad41ee5..985f15bc9df 100644 --- a/go/vt/vtgate/gateway/hybridgateway.go +++ b/go/vt/vtgate/gateway/hybridgateway.go @@ -90,7 +90,11 @@ func (h *HybridGateway) WaitForTablets(ctx context.Context, tabletTypesToWait [] // RegisterStats registers the l2vtgate connection counts stats. func (h *HybridGateway) RegisterStats() { - stats.NewMultiCountersFunc("L2VtgateConnections", []string{"Keyspace", "ShardName", "TabletType"}, h.servingConnStats) + stats.NewCountersFuncWithMultiLabels( + "L2VtgateConnections", + []string{"Keyspace", "ShardName", "TabletType"}, + "The l2vtgate connection counts", + h.servingConnStats) } func (h *HybridGateway) servingConnStats() map[string]int64 { diff --git a/go/vt/vtgate/gateway/status.go b/go/vt/vtgate/gateway/status.go index 2ef71f38dfa..8bd4ab736d9 100644 --- a/go/vt/vtgate/gateway/status.go +++ b/go/vt/vtgate/gateway/status.go @@ -84,13 +84,13 @@ var ( aggregators []*TabletStatusAggregator // gatewayStatsChanFull tracks the number of times // aggrChan becomes full. - gatewayStatsChanFull *stats.Int + gatewayStatsChanFull *stats.Counter ) func init() { // init global goroutines to aggregate stats. 
aggrChan = make(chan *queryInfo, aggrChanSize) - gatewayStatsChanFull = stats.NewInt("GatewayStatsChanFullCount") + gatewayStatsChanFull = stats.NewCounter("GatewayStatsChanFullCount", "The number of times the queryInfo buffer becomes full") go resetAggregators() go processQueryInfo() } diff --git a/go/vt/vtgate/l2vtgate.go b/go/vt/vtgate/l2vtgate.go index 9a73ffaa9a3..f4f7c6cdadd 100644 --- a/go/vt/vtgate/l2vtgate.go +++ b/go/vt/vtgate/l2vtgate.go @@ -42,7 +42,7 @@ var ( type L2VTGate struct { queryservice.QueryService timings *stats.MultiTimings - errorCounts *stats.MultiCounters + errorCounts *stats.CountersWithMultiLabels gateway gateway.Gateway } @@ -59,9 +59,15 @@ func initL2VTGate(gw gateway.Gateway) *L2VTGate { } l2VTGate = &L2VTGate{ - timings: stats.NewMultiTimings("QueryServiceCall", []string{"Operation", "Keyspace", "ShardName", "DbType"}), - errorCounts: stats.NewMultiCounters("QueryServiceCallErrorCount", []string{"Operation", "Keyspace", "ShardName", "DbType"}), - gateway: gw, + timings: stats.NewMultiTimings( + "QueryServiceCall", + "l2VTGate query service call timings", + []string{"Operation", "Keyspace", "ShardName", "DbType"}), + errorCounts: stats.NewCountersWithMultiLabels( + "QueryServiceCallErrorCount", + "Error count from calls to the query service", + []string{"Operation", "Keyspace", "ShardName", "DbType"}), + gateway: gw, } l2VTGate.QueryService = queryservice.Wrap( gw, diff --git a/go/vt/vtgate/scatter_conn.go b/go/vt/vtgate/scatter_conn.go index ff518941b1e..896f2062528 100644 --- a/go/vt/vtgate/scatter_conn.go +++ b/go/vt/vtgate/scatter_conn.go @@ -49,7 +49,7 @@ var ( // multiple shard level connections. 
type ScatterConn struct { timings *stats.MultiTimings - tabletCallErrorCount *stats.MultiCounters + tabletCallErrorCount *stats.CountersWithMultiLabels txConn *TxConn gateway gateway.Gateway healthCheck discovery.HealthCheck @@ -79,11 +79,17 @@ func NewScatterConn(statsName string, txConn *TxConn, gw gateway.Gateway, hc dis tabletCallErrorCountStatsName = statsName + "ErrorCount" } return &ScatterConn{ - timings: stats.NewMultiTimings(statsName, []string{"Operation", "Keyspace", "ShardName", "DbType"}), - tabletCallErrorCount: stats.NewMultiCounters(tabletCallErrorCountStatsName, []string{"Operation", "Keyspace", "ShardName", "DbType"}), - txConn: txConn, - gateway: gw, - healthCheck: hc, + timings: stats.NewMultiTimings( + statsName, + "Scatter connection timings", + []string{"Operation", "Keyspace", "ShardName", "DbType"}), + tabletCallErrorCount: stats.NewCountersWithMultiLabels( + tabletCallErrorCountStatsName, + "Error count from tablet calls in scatter conns", + []string{"Operation", "Keyspace", "ShardName", "DbType"}), + txConn: txConn, + gateway: gw, + healthCheck: hc, } } diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index 83379064432..90da5189322 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -89,7 +89,7 @@ var ( qpsByKeyspace *stats.Rates qpsByDbType *stats.Rates - vschemaCounters *stats.Counters + vschemaCounters *stats.CountersWithLabels errorsByOperation *stats.Rates errorsByKeyspace *stats.Rates @@ -97,9 +97,9 @@ var ( errorsByCode *stats.Rates // Error counters should be global so they can be set from anywhere - errorCounts *stats.MultiCounters + errorCounts *stats.CountersWithMultiLabels - warnings *stats.Counters + warnings *stats.CountersWithLabels ) // VTGate is the rpc interface to vtgate. Only one instance @@ -124,7 +124,7 @@ type VTGate struct { // TODO(sougou): This needs to be cleaned up. There // are global vars that depend on this member var. 
timings *stats.MultiTimings - rowsReturned *stats.MultiCounters + rowsReturned *stats.CountersWithMultiLabels // the throttled loggers for all errors, one per API entry logExecute *logutil.ThrottledLogger @@ -156,7 +156,7 @@ func Init(ctx context.Context, hc discovery.HealthCheck, serv srvtopo.Server, ce // vschemaCounters needs to be initialized before planner to // catch the initial load stats. - vschemaCounters = stats.NewCounters("VtgateVSchemaCounts") + vschemaCounters = stats.NewCountersWithLabels("VtgateVSchemaCounts", "Vtgate vschema counts", "changes") // Build objects from low to high level. // Start with the gateway. If we can't reach the topology service, @@ -199,13 +199,19 @@ func Init(ctx context.Context, hc discovery.HealthCheck, serv srvtopo.Server, ce resolver := NewResolver(srvResolver, serv, cell, sc) rpcVTGate = &VTGate{ - executor: NewExecutor(ctx, serv, cell, "VTGateExecutor", resolver, *normalizeQueries, *streamBufferSize, *queryPlanCacheSize, *legacyAutocommit), - resolver: resolver, - txConn: tc, - gw: gw, - l2vtgate: l2vtgate, - timings: stats.NewMultiTimings("VtgateApi", []string{"Operation", "Keyspace", "DbType"}), - rowsReturned: stats.NewMultiCounters("VtgateApiRowsReturned", []string{"Operation", "Keyspace", "DbType"}), + executor: NewExecutor(ctx, serv, cell, "VTGateExecutor", resolver, *normalizeQueries, *streamBufferSize, *queryPlanCacheSize, *legacyAutocommit), + resolver: resolver, + txConn: tc, + gw: gw, + l2vtgate: l2vtgate, + timings: stats.NewMultiTimings( + "VtgateApi", + "VtgateApi timings", + []string{"Operation", "Keyspace", "DbType"}), + rowsReturned: stats.NewCountersWithMultiLabels( + "VtgateApiRowsReturned", + "Rows returned through the VTgate API", + []string{"Operation", "Keyspace", "DbType"}), logExecute: logutil.NewThrottledLogger("Execute", 5*time.Second), logStreamExecute: logutil.NewThrottledLogger("StreamExecute", 5*time.Second), @@ -222,7 +228,7 @@ func Init(ctx context.Context, hc discovery.HealthCheck, serv 
srvtopo.Server, ce logMessageStream: logutil.NewThrottledLogger("MessageStream", 5*time.Second), } - errorCounts = stats.NewMultiCounters("VtgateApiErrorCounts", []string{"Operation", "Keyspace", "DbType", "Code"}) + errorCounts = stats.NewCountersWithMultiLabels("VtgateApiErrorCounts", "Vtgate API error counts per error type", []string{"Operation", "Keyspace", "DbType", "Code"}) qpsByOperation = stats.NewRates("QPSByOperation", stats.CounterForDimension(rpcVTGate.timings, "Operation"), 15, 1*time.Minute) qpsByKeyspace = stats.NewRates("QPSByKeyspace", stats.CounterForDimension(rpcVTGate.timings, "Keyspace"), 15, 1*time.Minute) @@ -233,7 +239,7 @@ func Init(ctx context.Context, hc discovery.HealthCheck, serv srvtopo.Server, ce errorsByDbType = stats.NewRates("ErrorsByDbType", stats.CounterForDimension(errorCounts, "DbType"), 15, 1*time.Minute) errorsByCode = stats.NewRates("ErrorsByCode", stats.CounterForDimension(errorCounts, "Code"), 15, 1*time.Minute) - warnings = stats.NewCounters("VTGateWarnings", "IgnoredSet") + warnings = stats.NewCountersWithLabels("VtGateWarnings", "Vtgate warnings", "type", "IgnoredSet") servenv.OnRun(func() { for _, f := range RegisterVTGates { @@ -1091,6 +1097,7 @@ func recordAndAnnotateError(err error, statsKey []string, request map[string]int request = truncateErrorStrings(request) errorCounts.Add(fullKey, 1) + // Most errors are not logged by vtgate because they're either too spammy or logged elsewhere. switch ec { case vtrpcpb.Code_UNKNOWN, vtrpcpb.Code_INTERNAL, vtrpcpb.Code_DATA_LOSS: diff --git a/go/vt/vttablet/heartbeat/heartbeat.go b/go/vt/vttablet/heartbeat/heartbeat.go index cd4bd33214e..2d13e93fc4b 100644 --- a/go/vt/vttablet/heartbeat/heartbeat.go +++ b/go/vt/vttablet/heartbeat/heartbeat.go @@ -34,16 +34,16 @@ import ( var ( // HeartbeatWrites keeps a count of the number of heartbeats written over time. 
- writes = stats.NewInt("HeartbeatWrites") + writes = stats.NewCounter("HeartbeatWrites", "Count of heartbeats written over time") // HeartbeatWriteErrors keeps a count of errors encountered while writing heartbeats. - writeErrors = stats.NewInt("HeartbeatWriteErrors") + writeErrors = stats.NewCounter("HeartbeatWriteErrors", "Count of errors encountered while writing heartbeats") // HeartbeatReads keeps a count of the number of heartbeats read over time. - reads = stats.NewInt("HeartbeatReads") + reads = stats.NewCounter("HeartbeatReads", "Count of heartbeats read over time") // HeartbeatReadErrors keeps a count of errors encountered while reading heartbeats. - readErrors = stats.NewInt("HeartbeatReadErrors") + readErrors = stats.NewCounter("HeartbeatReadErrors", "Count of errors encountered while reading heartbeats") // HeartbeatCumulativeLagNs is incremented by the current lag at each heartbeat read interval. Plotting this // over time allows calculating of a rolling average lag. - cumulativeLagNs = stats.NewInt("HeartbeatCumulativeLagNs") + cumulativeLagNs = stats.NewCounter("HeartbeatCumulativeLagNs", "Incremented by the current lag at each heartbeat read interval") // HeartbeatCurrentLagNs is a point-in-time calculation of the lag, updated at each heartbeat read interval. 
- currentLagNs = stats.NewInt("HeartbeatCurrentLagNs") + currentLagNs = stats.NewGauge("HeartbeatCurrentLagNs", "Point in time calculation of the heartbeat lag") ) diff --git a/go/vt/vttablet/heartbeat/reader_test.go b/go/vt/vttablet/heartbeat/reader_test.go index a8a58ff4265..be26bc5af35 100644 --- a/go/vt/vttablet/heartbeat/reader_test.go +++ b/go/vt/vttablet/heartbeat/reader_test.go @@ -49,9 +49,9 @@ func TestReaderReadHeartbeat(t *testing.T) { }}, }) - cumulativeLagNs.Set(0) - readErrors.Set(0) - reads.Set(0) + cumulativeLagNs.Reset() + readErrors.Reset() + reads.Reset() tr.readHeartbeat() lag, err := tr.GetLatest() @@ -81,8 +81,8 @@ func TestReaderReadHeartbeatError(t *testing.T) { tr := newReader(db, mockNowFunc) defer tr.Close() - cumulativeLagNs.Set(0) - readErrors.Set(0) + cumulativeLagNs.Reset() + readErrors.Reset() tr.readHeartbeat() lag, err := tr.GetLatest() diff --git a/go/vt/vttablet/heartbeat/writer.go b/go/vt/vttablet/heartbeat/writer.go index b909d4a2625..9e3e6f080cd 100644 --- a/go/vt/vttablet/heartbeat/writer.go +++ b/go/vt/vttablet/heartbeat/writer.go @@ -155,7 +155,7 @@ func (w *Writer) Close() { // and we also execute them with an isolated connection that turns off the binlog and // is closed at the end. 
func (w *Writer) initializeTables(cp *mysql.ConnParams) error { - conn, err := dbconnpool.NewDBConnection(cp, stats.NewTimings("")) + conn, err := dbconnpool.NewDBConnection(cp, stats.NewTimings("", "")) if err != nil { return fmt.Errorf("Failed to create connection for heartbeat: %v", err) } diff --git a/go/vt/vttablet/heartbeat/writer_test.go b/go/vt/vttablet/heartbeat/writer_test.go index 57b89553957..a1b053b5509 100644 --- a/go/vt/vttablet/heartbeat/writer_test.go +++ b/go/vt/vttablet/heartbeat/writer_test.go @@ -46,7 +46,7 @@ func TestCreateSchema(t *testing.T) { defer db.Close() tw := newTestWriter(db, mockNowFunc) defer tw.Close() - writes.Set(0) + writes.Reset() db.AddQuery(sqlTurnoffBinlog, &sqltypes.Result{}) db.AddQuery(fmt.Sprintf(sqlCreateHeartbeatTable, tw.dbName), &sqltypes.Result{}) @@ -74,8 +74,8 @@ func TestWriteHeartbeat(t *testing.T) { tw := newTestWriter(db, mockNowFunc) db.AddQuery(fmt.Sprintf("UPDATE %s.heartbeat SET ts=%d, tabletUid=%d WHERE keyspaceShard='%s'", tw.dbName, now.UnixNano(), tw.tabletAlias.Uid, tw.keyspaceShard), &sqltypes.Result{}) - writes.Set(0) - writeErrors.Set(0) + writes.Reset() + writeErrors.Reset() tw.writeHeartbeat() if got, want := writes.Get(), int64(1); got != want { @@ -93,8 +93,8 @@ func TestWriteHeartbeatError(t *testing.T) { tw := newTestWriter(db, mockNowFunc) - writes.Set(0) - writeErrors.Set(0) + writes.Reset() + writeErrors.Reset() tw.writeHeartbeat() if got, want := writes.Get(), int64(0); got != want { diff --git a/go/vt/vttablet/tabletmanager/binlog_players.go b/go/vt/vttablet/tabletmanager/binlog_players.go index 338020cd084..2fcf953ddd3 100644 --- a/go/vt/vttablet/tabletmanager/binlog_players.go +++ b/go/vt/vttablet/tabletmanager/binlog_players.go @@ -421,16 +421,19 @@ func NewBinlogPlayerMap(ts *topo.Server, mysqld mysqlctl.MysqlDaemon, vtClientFa // RegisterBinlogPlayerMap registers the varz for the players. 
func RegisterBinlogPlayerMap(blm *BinlogPlayerMap) { - stats.Publish("BinlogPlayerMapSize", stats.IntFunc(stats.IntFunc(func() int64 { + stats.NewGaugeFunc("BinlogPlayerMapSize", "Binlog player map size", stats.IntFunc(func() int64 { blm.mu.Lock() defer blm.mu.Unlock() return int64(len(blm.players)) - }))) - stats.Publish("BinlogPlayerSecondsBehindMaster", stats.IntFunc(func() int64 { - blm.mu.Lock() - defer blm.mu.Unlock() - return blm.maxSecondsBehindMasterUNGUARDED() })) + stats.NewGaugeFunc( + "BinlogPlayerSecondsBehindMaster", + "Binlog player seconds behind master", + stats.IntFunc(func() int64 { + blm.mu.Lock() + defer blm.mu.Unlock() + return blm.maxSecondsBehindMasterUNGUARDED() + })) stats.Publish("BinlogPlayerSecondsBehindMasterMap", stats.CountersFunc(func() map[string]int64 { blm.mu.Lock() result := make(map[string]int64, len(blm.players)) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index d54ad131b9b..b103c59a3db 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -39,7 +39,7 @@ import ( var ( finalizeReparentTimeout = flag.Duration("finalize_external_reparent_timeout", 30*time.Second, "Timeout for the finalize stage of a fast external reparent reconciliation.") - externalReparentStats = stats.NewTimings("ExternalReparents", "NewMasterVisible", "FullRebuild") + externalReparentStats = stats.NewTimings("ExternalReparents", "Stats from external reparentings", "NewMasterVisible", "FullRebuild") ) // SetReparentFlags changes flag values. It should only be used in tests. 
diff --git a/go/vt/vttablet/tabletserver/connpool/pool.go b/go/vt/vttablet/tabletserver/connpool/pool.go index 38ae5ef67eb..a36e4f29863 100644 --- a/go/vt/vttablet/tabletserver/connpool/pool.go +++ b/go/vt/vttablet/tabletserver/connpool/pool.go @@ -82,15 +82,15 @@ func New( return cp } usedNames[name] = true - stats.Publish(name+"Capacity", stats.IntFunc(cp.Capacity)) - stats.Publish(name+"Available", stats.IntFunc(cp.Available)) - stats.Publish(name+"Active", stats.IntFunc(cp.Active)) - stats.Publish(name+"InUse", stats.IntFunc(cp.InUse)) - stats.Publish(name+"MaxCap", stats.IntFunc(cp.MaxCap)) - stats.Publish(name+"WaitCount", stats.IntFunc(cp.WaitCount)) - stats.Publish(name+"WaitTime", stats.DurationFunc(cp.WaitTime)) - stats.Publish(name+"IdleTimeout", stats.DurationFunc(cp.IdleTimeout)) - stats.Publish(name+"IdleClosed", stats.IntFunc(cp.IdleClosed)) + stats.NewGaugeFunc(name+"Capacity", "Tablet server conn pool capacity", stats.IntFunc(cp.Capacity)) + stats.NewGaugeFunc(name+"Available", "Tablet server conn pool available", stats.IntFunc(cp.Available)) + stats.NewGaugeFunc(name+"Active", "Tablet server conn pool active", stats.IntFunc(cp.Active)) + stats.NewGaugeFunc(name+"InUse", "Tablet server conn pool in use", stats.IntFunc(cp.InUse)) + stats.NewGaugeFunc(name+"MaxCap", "Tablet server conn pool max cap", stats.IntFunc(cp.MaxCap)) + stats.NewCounterFunc(name+"WaitCount", "Tablet server conn pool wait count", stats.IntFunc(cp.WaitCount)) + stats.NewCounterFunc(name+"WaitTime", "Tablet server wait time", stats.DurationFunc(cp.WaitTime)) + stats.NewCounterFunc(name+"IdleTimeout", "Tablet server idle timeout", stats.DurationFunc(cp.IdleTimeout)) + stats.NewCounterFunc(name+"IdleClosed", "Tablet server conn pool idle closed", stats.IntFunc(cp.IdleClosed)) return cp } diff --git a/go/vt/vttablet/tabletserver/messager/message_manager.go b/go/vt/vttablet/tabletserver/messager/message_manager.go index a0cce941691..6dbbb8727fe 100644 --- 
a/go/vt/vttablet/tabletserver/messager/message_manager.go +++ b/go/vt/vttablet/tabletserver/messager/message_manager.go @@ -37,10 +37,16 @@ import ( ) // MessageStats tracks stats for messages. -var MessageStats = stats.NewMultiCounters("Messages", []string{"TableName", "Metric"}) +var MessageStats = stats.NewGaugesWithMultiLabels( + "Messages", + "Stats for messages", + []string{"TableName", "Metric"}) // MessageDelayTimings records total latency from queueing to sent to clients. -var MessageDelayTimings = stats.NewMultiTimings("MessageDelay", []string{"TableName"}) +var MessageDelayTimings = stats.NewMultiTimings( + "MessageDelay", + "MessageDelayTimings records total latency from queueing to client sends", + []string{"TableName"}) type messageReceiver struct { ctx context.Context diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go index 447914fed46..6a3b399ffc1 100644 --- a/go/vt/vttablet/tabletserver/query_engine.go +++ b/go/vt/vttablet/tabletserver/query_engine.go @@ -234,24 +234,24 @@ func NewQueryEngine(checker connpool.MySQLChecker, se *schema.Engine, config tab qe.accessCheckerLogger = logutil.NewThrottledLogger("accessChecker", 1*time.Second) qeOnce.Do(func() { - stats.Publish("MaxResultSize", stats.IntFunc(qe.maxResultSize.Get)) - stats.Publish("WarnResultSize", stats.IntFunc(qe.warnResultSize.Get)) - stats.Publish("MaxDMLRows", stats.IntFunc(qe.maxDMLRows.Get)) - stats.Publish("StreamBufferSize", stats.IntFunc(qe.streamBufferSize.Get)) - stats.Publish("TableACLExemptCount", stats.IntFunc(qe.tableaclExemptCount.Get)) - stats.Publish("QueryPoolWaiters", stats.IntFunc(qe.queryPoolWaiters.Get)) - - stats.Publish("QueryCacheLength", stats.IntFunc(qe.plans.Length)) - stats.Publish("QueryCacheSize", stats.IntFunc(qe.plans.Size)) - stats.Publish("QueryCacheCapacity", stats.IntFunc(qe.plans.Capacity)) - stats.Publish("QueryCacheEvictions", stats.IntFunc(qe.plans.Evictions)) + stats.NewGaugeFunc("MaxResultSize", 
"Query engine max result size", stats.IntFunc(qe.maxResultSize.Get)) + stats.NewGaugeFunc("WarnResultSize", "Query engine warn result size", stats.IntFunc(qe.warnResultSize.Get)) + stats.NewGaugeFunc("MaxDMLRows", "Query engine max DML rows", stats.IntFunc(qe.maxDMLRows.Get)) + stats.NewGaugeFunc("StreamBufferSize", "Query engine stream buffer size", stats.IntFunc(qe.streamBufferSize.Get)) + stats.NewCounterFunc("TableACLExemptCount", "Query engine table ACL exempt count", stats.IntFunc(qe.tableaclExemptCount.Get)) + stats.NewGaugeFunc("QueryPoolWaiters", "Query engine query pool waiters", stats.IntFunc(qe.queryPoolWaiters.Get)) + + stats.NewGaugeFunc("QueryCacheLength", "Query engine query cache length", stats.IntFunc(qe.plans.Length)) + stats.NewGaugeFunc("QueryCacheSize", "Query engine query cache size", stats.IntFunc(qe.plans.Size)) + stats.NewGaugeFunc("QueryCacheCapacity", "Query engine query cache capacity", stats.IntFunc(qe.plans.Capacity)) + stats.NewCounterFunc("QueryCacheEvictions", "Query engine query cache evictions", stats.IntFunc(qe.plans.Evictions)) stats.Publish("QueryCacheOldest", stats.StringFunc(func() string { return fmt.Sprintf("%v", qe.plans.Oldest()) })) - _ = stats.NewMultiCountersFunc("QueryCounts", []string{"Table", "Plan"}, qe.getQueryCount) - _ = stats.NewMultiCountersFunc("QueryTimesNs", []string{"Table", "Plan"}, qe.getQueryTime) - _ = stats.NewMultiCountersFunc("QueryRowCounts", []string{"Table", "Plan"}, qe.getQueryRowCount) - _ = stats.NewMultiCountersFunc("QueryErrorCounts", []string{"Table", "Plan"}, qe.getQueryErrorCount) + _ = stats.NewCountersFuncWithMultiLabels("QueryCounts", []string{"Table", "Plan"}, "query counts", qe.getQueryCount) + _ = stats.NewCountersFuncWithMultiLabels("QueryTimesNs", []string{"Table", "Plan"}, "query times in ns", qe.getQueryTime) + _ = stats.NewCountersFuncWithMultiLabels("QueryRowCounts", []string{"Table", "Plan"}, "query row counts", qe.getQueryRowCount) + _ = 
stats.NewCountersFuncWithMultiLabels("QueryErrorCounts", []string{"Table", "Plan"}, "query error counts", qe.getQueryErrorCount) http.Handle("/debug/hotrows", qe.txSerializer) diff --git a/go/vt/vttablet/tabletserver/query_executor_test.go b/go/vt/vttablet/tabletserver/query_executor_test.go index 1bb1699054a..daf887b5792 100644 --- a/go/vt/vttablet/tabletserver/query_executor_test.go +++ b/go/vt/vttablet/tabletserver/query_executor_test.go @@ -1821,13 +1821,13 @@ func TestQueryExecutorTableAclDryRun(t *testing.T) { qre := newTestQueryExecutor(ctx, tsv, query, 0) defer tsv.StopService() checkPlanID(t, planbuilder.PlanPassSelect, qre.plan.PlanID) - beforeCount := tabletenv.TableaclPseudoDenied.Counters.Counts()[tableACLStatsKey] + beforeCount := tabletenv.TableaclPseudoDenied.Counts()[tableACLStatsKey] // query should fail because current user do not have read permissions _, err := qre.Execute() if err != nil { t.Fatalf("qre.Execute() = %v, want: nil", err) } - afterCount := tabletenv.TableaclPseudoDenied.Counters.Counts()[tableACLStatsKey] + afterCount := tabletenv.TableaclPseudoDenied.Counts()[tableACLStatsKey] if afterCount-beforeCount != 1 { t.Fatalf("table acl pseudo denied count should increase by one. 
got: %d, want: %d", afterCount, beforeCount+1) } diff --git a/go/vt/vttablet/tabletserver/replication_watcher.go b/go/vt/vttablet/tabletserver/replication_watcher.go index 5e78af72e6e..cc4fe412e28 100644 --- a/go/vt/vttablet/tabletserver/replication_watcher.go +++ b/go/vt/vttablet/tabletserver/replication_watcher.go @@ -68,12 +68,15 @@ func NewReplicationWatcher(se *schema.Engine, config tabletenv.TabletConfig) *Re } return "" })) - stats.Publish("EventTokenTimestamp", stats.IntFunc(func() int64 { - if e := rpw.EventToken(); e != nil { - return e.Timestamp - } - return 0 - })) + stats.NewGaugeFunc( + "EventTokenTimestamp", + "Replication watcher event token timestamp", + stats.IntFunc(func() int64 { + if e := rpw.EventToken(); e != nil { + return e.Timestamp + } + return 0 + })) }) return rpw } diff --git a/go/vt/vttablet/tabletserver/schema/engine.go b/go/vt/vttablet/tabletserver/schema/engine.go index 8007ae11229..dfa5c52a601 100644 --- a/go/vt/vttablet/tabletserver/schema/engine.go +++ b/go/vt/vttablet/tabletserver/schema/engine.go @@ -77,12 +77,12 @@ func NewEngine(checker connpool.MySQLChecker, config tabletenv.TabletConfig) *En reloadTime: reloadTime, } schemaOnce.Do(func() { - stats.Publish("SchemaReloadTime", stats.DurationFunc(se.ticks.Interval)) - _ = stats.NewMultiCountersFunc("TableRows", []string{"Table"}, se.getTableRows) - _ = stats.NewMultiCountersFunc("DataLength", []string{"Table"}, se.getDataLength) - _ = stats.NewMultiCountersFunc("IndexLength", []string{"Table"}, se.getIndexLength) - _ = stats.NewMultiCountersFunc("DataFree", []string{"Table"}, se.getDataFree) - _ = stats.NewMultiCountersFunc("MaxDataLength", []string{"Table"}, se.getMaxDataLength) + _ = stats.NewGaugeFunc("SchemaReloadTime", "vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. 
This config controls the reload time.", stats.DurationFunc(se.ticks.Interval)) + _ = stats.NewGaugesFuncWithMultiLabels("TableRows", []string{"Table"}, "table rows created in tabletserver", se.getTableRows) + _ = stats.NewGaugesFuncWithMultiLabels("DataLength", []string{"Table"}, "data length in tabletserver", se.getDataLength) + _ = stats.NewGaugesFuncWithMultiLabels("IndexLength", []string{"Table"}, "index length in tabletserver", se.getIndexLength) + _ = stats.NewGaugesFuncWithMultiLabels("DataFree", []string{"Table"}, "data free in tabletserver", se.getDataFree) + _ = stats.NewGaugesFuncWithMultiLabels("MaxDataLength", []string{"Table"}, "max data length in tabletserver", se.getMaxDataLength) http.Handle("/debug/schema", se) http.HandleFunc("/schemaz", func(w http.ResponseWriter, r *http.Request) { diff --git a/go/vt/vttablet/tabletserver/tabletenv/tabletenv.go b/go/vt/vttablet/tabletserver/tabletenv/tabletenv.go index 0dc5986c266..49859b52474 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/tabletenv.go +++ b/go/vt/vttablet/tabletserver/tabletenv/tabletenv.go @@ -34,18 +34,20 @@ import ( var ( // MySQLStats shows the time histogram for operations spent on mysql side. - MySQLStats = stats.NewTimings("Mysql") + MySQLStats = stats.NewTimings("Mysql", "MySQL query time") // QueryStats shows the time histogram for each type of queries. - QueryStats = stats.NewTimings("Queries") + QueryStats = stats.NewTimings("Queries", "MySQL query timings") // QPSRates shows the qps of QueryStats. Sample every 5 seconds and keep samples for up to 15 mins. QPSRates = stats.NewRates("QPS", QueryStats, 15*60/5, 5*time.Second) // WaitStats shows the time histogram for wait operations - WaitStats = stats.NewTimings("Waits") + WaitStats = stats.NewTimings("Waits", "Wait operations") // KillStats shows number of connections being killed. - KillStats = stats.NewCounters("Kills", "Transactions", "Queries") - // ErrorStats shows number of critial erros happened. 
- ErrorStats = stats.NewCounters( + KillStats = stats.NewCountersWithLabels("Kills", "Number of connections being killed", "query_type", "Transactions", "Queries") + // ErrorStats shows number of critical errors happened. + ErrorStats = stats.NewCountersWithLabels( "Errors", + "Critical errors", + "error_code", vtrpcpb.Code_OK.String(), vtrpcpb.Code_CANCELED.String(), vtrpcpb.Code_UNKNOWN.String(), @@ -65,27 +67,48 @@ var ( vtrpcpb.Code_DATA_LOSS.String(), ) // InternalErrors shows number of errors from internal components. - InternalErrors = stats.NewCounters("InternalErrors", "Task", "StrayTransactions", "Panic", "HungQuery", "Schema", "TwopcCommit", "TwopcResurrection", "WatchdogFail", "Messages") + InternalErrors = stats.NewCountersWithLabels("InternalErrors", "Internal component errors", "type", "Task", "StrayTransactions", "Panic", "HungQuery", "Schema", "TwopcCommit", "TwopcResurrection", "WatchdogFail", "Messages") // Warnings shows number of warnings - Warnings = stats.NewCounters("Warnings", "ResultsExceeded") + Warnings = stats.NewCountersWithLabels("Warnings", "Warnings", "type", "ResultsExceeded") // Unresolved tracks unresolved items. For now it's just Prepares. - Unresolved = stats.NewCounters("Unresolved", "Prepares") + Unresolved = stats.NewGaugesWithLabels("Unresolved", "Unresolved items", "item_type", "Prepares") // UserTableQueryCount shows number of queries received for each CallerID/table combination. - UserTableQueryCount = stats.NewMultiCounters("UserTableQueryCount", []string{"TableName", "CallerID", "Type"}) + UserTableQueryCount = stats.NewCountersWithMultiLabels( + "UserTableQueryCount", + "Queries received for each CallerID/table combination", + []string{"TableName", "CallerID", "Type"}) // UserTableQueryTimesNs shows total latency for each CallerID/table combination. 
- UserTableQueryTimesNs = stats.NewMultiCounters("UserTableQueryTimesNs", []string{"TableName", "CallerID", "Type"}) + UserTableQueryTimesNs = stats.NewCountersWithMultiLabels( + "UserTableQueryTimesNs", + "Total latency for each CallerID/table combination", + []string{"TableName", "CallerID", "Type"}) // UserTransactionCount shows number of transactions received for each CallerID. - UserTransactionCount = stats.NewMultiCounters("UserTransactionCount", []string{"CallerID", "Conclusion"}) + UserTransactionCount = stats.NewCountersWithMultiLabels( + "UserTransactionCount", + "transactions received for each CallerID", + []string{"CallerID", "Conclusion"}) // UserTransactionTimesNs shows total transaction latency for each CallerID. - UserTransactionTimesNs = stats.NewMultiCounters("UserTransactionTimesNs", []string{"CallerID", "Conclusion"}) + UserTransactionTimesNs = stats.NewCountersWithMultiLabels( + "UserTransactionTimesNs", + "Total transaction latency for each CallerID", + []string{"CallerID", "Conclusion"}) // ResultStats shows the histogram of number of rows returned. ResultStats = stats.NewHistogram("Results", []int64{0, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000}) // TableaclAllowed tracks the number allows. - TableaclAllowed = stats.NewMultiCounters("TableACLAllowed", []string{"TableName", "TableGroup", "PlanID", "Username"}) + TableaclAllowed = stats.NewCountersWithMultiLabels( + "TableACLAllowed", + "ACL acceptances", + []string{"TableName", "TableGroup", "PlanID", "Username"}) // TableaclDenied tracks the number of denials. - TableaclDenied = stats.NewMultiCounters("TableACLDenied", []string{"TableName", "TableGroup", "PlanID", "Username"}) + TableaclDenied = stats.NewCountersWithMultiLabels( + "TableACLDenied", + "ACL denials", + []string{"TableName", "TableGroup", "PlanID", "Username"}) // TableaclPseudoDenied tracks the number of pseudo denies. 
- TableaclPseudoDenied = stats.NewMultiCounters("TableACLPseudoDenied", []string{"TableName", "TableGroup", "PlanID", "Username"}) + TableaclPseudoDenied = stats.NewCountersWithMultiLabels( + "TableACLPseudoDenied", + "ACL pseudodenials", + []string{"TableName", "TableGroup", "PlanID", "Username"}) // Infof can be overridden during tests Infof = log.Infof // Warningf can be overridden during tests diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index a53dfae91b1..30ea83f48d3 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -238,15 +238,15 @@ func NewTabletServer(config tabletenv.TabletConfig, topoServer *topo.Server, ali // So that vtcombo doesn't even call it once, on the first tablet. // And we can remove the tsOnce variable. tsOnce.Do(func() { - stats.Publish("TabletState", stats.IntFunc(func() int64 { + stats.NewGaugeFunc("TabletState", "Tablet server state", stats.IntFunc(func() int64 { tsv.mu.Lock() state := tsv.state tsv.mu.Unlock() return state })) - stats.Publish("QueryTimeout", stats.DurationFunc(tsv.QueryTimeout.Get)) - stats.Publish("QueryPoolTimeout", stats.DurationFunc(tsv.qe.connTimeout.Get)) - stats.Publish("BeginTimeout", stats.DurationFunc(tsv.BeginTimeout.Get)) + stats.NewGaugeFunc("QueryTimeout", "Tablet server query timeout", stats.DurationFunc(tsv.QueryTimeout.Get)) + stats.NewGaugeFunc("QueryPoolTimeout", "Tablet server timeout to get a connection from the query pool", stats.DurationFunc(tsv.qe.connTimeout.Get)) + stats.NewGaugeFunc("BeginTimeout", "Tablet server begin timeout", stats.DurationFunc(tsv.BeginTimeout.Get)) stats.Publish("TabletStateName", stats.StringFunc(tsv.GetState)) }) return tsv diff --git a/go/vt/vttablet/tabletserver/twopc.go b/go/vt/vttablet/tabletserver/twopc.go index b6c4c539706..ee8cc5bc82c 100644 --- a/go/vt/vttablet/tabletserver/twopc.go +++ b/go/vt/vttablet/tabletserver/twopc.go @@ -130,7 +130,7 @@ 
func NewTwoPC(readPool *connpool.Pool) *TwoPC { // are not present, they are created. func (tpc *TwoPC) Init(sidecarDBName string, dbaparams *mysql.ConnParams) error { dbname := sqlescape.EscapeID(sidecarDBName) - conn, err := dbconnpool.NewDBConnection(dbaparams, stats.NewTimings("")) + conn, err := dbconnpool.NewDBConnection(dbaparams, stats.NewTimings("", "")) if err != nil { return err } diff --git a/go/vt/vttablet/tabletserver/tx_pool.go b/go/vt/vttablet/tabletserver/tx_pool.go index 85aaa965fa7..8aeb5ca2fff 100644 --- a/go/vt/vttablet/tabletserver/tx_pool.go +++ b/go/vt/vttablet/tabletserver/tx_pool.go @@ -56,7 +56,7 @@ const txLogInterval = time.Duration(1 * time.Minute) var ( txOnce sync.Once - txStats = stats.NewTimings("Transactions") + txStats = stats.NewTimings("Transactions", "Transaction stats") txIsolations = map[querypb.ExecuteOptions_TransactionIsolation]string{ querypb.ExecuteOptions_REPEATABLE_READ: "set transaction isolation level REPEATABLE READ", @@ -119,8 +119,8 @@ func NewTxPool( txOnce.Do(func() { // Careful: conns also exports name+"xxx" vars, // but we know it doesn't export Timeout. 
- stats.Publish(prefix+"TransactionPoolTimeout", stats.DurationFunc(axp.timeout.Get)) - stats.Publish(prefix+"TransactionPoolWaiters", stats.IntFunc(axp.waiters.Get)) + stats.NewGaugeFunc(prefix+"TransactionPoolTimeout", "Transaction pool timeout", stats.DurationFunc(axp.timeout.Get)) + stats.NewGaugeFunc(prefix+"TransactionPoolWaiters", "Transaction pool waiters", stats.IntFunc(axp.waiters.Get)) }) return axp } diff --git a/go/vt/vttablet/tabletserver/txlimiter/tx_limiter.go b/go/vt/vttablet/tabletserver/txlimiter/tx_limiter.go index acc4e97b8d4..eb725da9e87 100644 --- a/go/vt/vttablet/tabletserver/txlimiter/tx_limiter.go +++ b/go/vt/vttablet/tabletserver/txlimiter/tx_limiter.go @@ -32,8 +32,8 @@ import ( const unknown string = "unknown" var ( - rejections = stats.NewCounters("TxLimiterRejections") - rejectionsDryRun = stats.NewCounters("TxLimiterRejectionsDryRun") + rejections = stats.NewCountersWithLabels("TxLimiterRejections", "rejections from TxLimiter", "user") + rejectionsDryRun = stats.NewCountersWithLabels("TxLimiterRejectionsDryRun", "rejections from TxLimiter in dry run", "user") ) // TxLimiter is the transaction limiter interface. 
diff --git a/go/vt/vttablet/tabletserver/txlimiter/tx_limiter_test.go b/go/vt/vttablet/tabletserver/txlimiter/tx_limiter_test.go index 48251e76013..e56b093e093 100644 --- a/go/vt/vttablet/tabletserver/txlimiter/tx_limiter_test.go +++ b/go/vt/vttablet/tabletserver/txlimiter/tx_limiter_test.go @@ -10,8 +10,8 @@ import ( ) func resetVariables() { - rejections.Reset() - rejectionsDryRun.Reset() + rejections.ResetAll() + rejectionsDryRun.ResetAll() } func createCallers(username, principal, component, subcomponent string) (*querypb.VTGateCallerID, *vtrpcpb.CallerID) { diff --git a/go/vt/vttablet/tabletserver/txserializer/tx_serializer.go b/go/vt/vttablet/tabletserver/txserializer/tx_serializer.go index bf7238b5ce0..15bc1a63258 100644 --- a/go/vt/vttablet/tabletserver/txserializer/tx_serializer.go +++ b/go/vt/vttablet/tabletserver/txserializer/tx_serializer.go @@ -40,21 +40,38 @@ var ( // waits stores how many times a transaction was queued because another // transaction was already in flight for the same row (range). // The key of the map is the table name of the query. - waits = stats.NewCounters("TxSerializerWaits") + waits = stats.NewCountersWithLabels( + "TxSerializerWaits", + "Number of times a transaction was queued because another transaction was already in flight for the same row range", + "table_name") // waitsDryRun is similar as "waits": In dry-run mode it records how many // transactions would have been queued. // The key of the map is the table name of the query. - waitsDryRun = stats.NewCounters("TxSerializerWaitsDryRun") + waitsDryRun = stats.NewCountersWithLabels( + "TxSerializerWaitsDryRun", + "Dry run number of transactions that would've been queued", + "table_name") // queueExceeded counts per table how many transactions were rejected because // the max queue size per row (range) was exceeded. 
- queueExceeded = stats.NewCounters("TxSerializerQueueExceeded") + queueExceeded = stats.NewCountersWithLabels( + "TxSerializerQueueExceeded", + "Number of transactions that were rejected because the max queue size per row range was exceeded", + "table_name") // queueExceededDryRun counts in dry-run mode how many transactions would have // been rejected due to exceeding the max queue size per row (range). - queueExceededDryRun = stats.NewCounters("TxSerializerQueueExceededDryRun") + queueExceededDryRun = stats.NewCountersWithLabels( + "TxSerializerQueueExceededDryRun", + "Dry-run number of transactions that were rejected because the max queue size was exceeded", + "table_name") + // globalQueueExceeded is the same as queueExceeded but for the global queue. - globalQueueExceeded = stats.NewInt("TxSerializerGlobalQueueExceeded") - globalQueueExceededDryRun = stats.NewInt("TxSerializerGlobalQueueExceededDryRun") + globalQueueExceeded = stats.NewCounter( + "TxSerializerGlobalQueueExceeded", + "Number of transactions that were rejected on the global queue because of exceeding the max queue size per row range") + globalQueueExceededDryRun = stats.NewCounter( + "TxSerializerGlobalQueueExceededDryRun", + "Dry-run stats for TxSerializerGlobalQueueExceeded") ) // TxSerializer serializes incoming transactions which target the same row range diff --git a/go/vt/vttablet/tabletserver/txserializer/tx_serializer_test.go b/go/vt/vttablet/tabletserver/txserializer/tx_serializer_test.go index 6f74649d2d7..bc64e396a66 100644 --- a/go/vt/vttablet/tabletserver/txserializer/tx_serializer_test.go +++ b/go/vt/vttablet/tabletserver/txserializer/tx_serializer_test.go @@ -34,12 +34,12 @@ import ( ) func resetVariables() { - waits.Reset() - waitsDryRun.Reset() - queueExceeded.Reset() - queueExceededDryRun.Reset() - globalQueueExceeded.Set(0) - globalQueueExceededDryRun.Set(0) + waits.ResetAll() + waitsDryRun.ResetAll() + queueExceeded.ResetAll() + queueExceededDryRun.ResetAll() + 
globalQueueExceeded.Reset() + globalQueueExceededDryRun.Reset() } func TestTxSerializer_NoHotRow(t *testing.T) { diff --git a/go/vt/worker/legacy_split_clone_test.go b/go/vt/worker/legacy_split_clone_test.go index 7115b106086..06f9c9f4e58 100644 --- a/go/vt/worker/legacy_split_clone_test.go +++ b/go/vt/worker/legacy_split_clone_test.go @@ -469,7 +469,7 @@ func TestLegacySplitCloneV2_NoMasterAvailable(t *testing.T) { // Reset the retry stats now. It also happens when the worker starts but that // is too late because this Go routine potentially reads it before the worker // resets the old value. - statsRetryCounters.Reset() + statsRetryCounters.ResetAll() go func() { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diff --git a/go/vt/worker/row_aggregator.go b/go/vt/worker/row_aggregator.go index 9d7c4f54d7d..97726208ddf 100644 --- a/go/vt/worker/row_aggregator.go +++ b/go/vt/worker/row_aggregator.go @@ -48,9 +48,9 @@ type RowAggregator struct { td *tabletmanagerdatapb.TableDefinition diffType DiffType builder QueryBuilder - // statsCounters has a "diffType" specific stats.Counters object to track how + // statsCounters has a "diffType" specific stats.CountersWithLabels object to track how // many rows were changed per table. - statsCounters *stats.Counters + statsCounters *stats.CountersWithLabels buffer bytes.Buffer bufferedRows int @@ -60,7 +60,7 @@ type RowAggregator struct { // The index of the elements in statCounters must match the elements // in "DiffTypes" i.e. the first counter is for inserts, second for updates // and the third for deletes. 
-func NewRowAggregator(ctx context.Context, maxRows, maxSize int, insertChannel chan string, dbName string, td *tabletmanagerdatapb.TableDefinition, diffType DiffType, statsCounters *stats.Counters) *RowAggregator { +func NewRowAggregator(ctx context.Context, maxRows, maxSize int, insertChannel chan string, dbName string, td *tabletmanagerdatapb.TableDefinition, diffType DiffType, statsCounters *stats.CountersWithLabels) *RowAggregator { // Construct head and tail base commands for the reconciliation statement. var builder QueryBuilder switch diffType { diff --git a/go/vt/worker/row_differ.go b/go/vt/worker/row_differ.go index 6f412d89bff..6f1e50b196f 100644 --- a/go/vt/worker/row_differ.go +++ b/go/vt/worker/row_differ.go @@ -74,7 +74,7 @@ type RowDiffer2 struct { // aggregators are keyed by destination shard and DiffType. aggregators [][]*RowAggregator // equalRowsStatsCounters tracks per table how many rows are equal. - equalRowsStatsCounters *stats.Counters + equalRowsStatsCounters *stats.CountersWithLabels // tableName is required to update "equalRowsStatsCounters". tableName string } @@ -89,7 +89,7 @@ func NewRowDiffer2(ctx context.Context, left, right ResultReader, td *tabletmana // Parameters required by RowRouter. destinationShards []*topo.ShardInfo, keyResolver keyspaceIDResolver, // Parameters required by RowAggregator. - insertChannels []chan string, abort <-chan struct{}, dbNames []string, writeQueryMaxRows, writeQueryMaxSize int, statsCounters []*stats.Counters) (*RowDiffer2, error) { + insertChannels []chan string, abort <-chan struct{}, dbNames []string, writeQueryMaxRows, writeQueryMaxSize int, statsCounters []*stats.CountersWithLabels) (*RowDiffer2, error) { if len(statsCounters) != len(DiffTypes) { panic(fmt.Sprintf("statsCounter has the wrong number of elements. 
got = %v, want = %v", len(statsCounters), len(DiffTypes))) diff --git a/go/vt/worker/split_clone.go b/go/vt/worker/split_clone.go index 4d19fb28ee5..3dcc884c079 100644 --- a/go/vt/worker/split_clone.go +++ b/go/vt/worker/split_clone.go @@ -826,14 +826,14 @@ func (scw *SplitCloneWorker) clone(ctx context.Context, state StatusWorkerState) } firstSourceTablet = tablets[0].Tablet } - var statsCounters []*stats.Counters + var statsCounters []*stats.CountersWithLabels var tableStatusList *tableStatusList switch state { case WorkerStateCloneOnline: - statsCounters = []*stats.Counters{statsOnlineInsertsCounters, statsOnlineUpdatesCounters, statsOnlineDeletesCounters, statsOnlineEqualRowsCounters} + statsCounters = []*stats.CountersWithLabels{statsOnlineInsertsCounters, statsOnlineUpdatesCounters, statsOnlineDeletesCounters, statsOnlineEqualRowsCounters} tableStatusList = scw.tableStatusListOnline case WorkerStateCloneOffline: - statsCounters = []*stats.Counters{statsOfflineInsertsCounters, statsOfflineUpdatesCounters, statsOfflineDeletesCounters, statsOfflineEqualRowsCounters} + statsCounters = []*stats.CountersWithLabels{statsOfflineInsertsCounters, statsOfflineUpdatesCounters, statsOfflineDeletesCounters, statsOfflineEqualRowsCounters} tableStatusList = scw.tableStatusListOffline } diff --git a/go/vt/worker/split_clone_test.go b/go/vt/worker/split_clone_test.go index c7151333652..7eeba186e65 100644 --- a/go/vt/worker/split_clone_test.go +++ b/go/vt/worker/split_clone_test.go @@ -1013,7 +1013,7 @@ func TestSplitCloneV2_NoMasterAvailable(t *testing.T) { // // Reset the stats now. It also happens when the worker starts but that's too // late because this Go routine looks at it and can run before the worker. 
- statsRetryCounters.Reset() + statsRetryCounters.ResetAll() go func() { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diff --git a/go/vt/worker/worker.go b/go/vt/worker/worker.go index 1e36228f302..bec1083785a 100644 --- a/go/vt/worker/worker.go +++ b/go/vt/worker/worker.go @@ -57,54 +57,67 @@ var ( healthcheckRetryDelay = flag.Duration("worker_healthcheck_retry_delay", 5*time.Second, "delay before retrying a failed healthcheck") healthCheckTimeout = flag.Duration("worker_healthcheck_timeout", time.Minute, "the health check timeout period") - statsState = stats.NewString("WorkerState") - // statsRetryCount is the total number of times a query to vttablet had to be retried. - statsRetryCount = stats.NewInt("WorkerRetryCount") - // statsRetryCount groups the number of retries by category e.g. "TimeoutError" or "Readonly". - statsRetryCounters = stats.NewCounters("WorkerRetryCounters") - // statsThrottledCounters is the number of times a write has been throttled, - // grouped by (keyspace, shard, threadID). Mainly used for testing. - // If throttling is enabled, this should always be non-zero for all threads. - statsThrottledCounters = stats.NewMultiCounters("WorkerThrottledCounters", []string{"Keyspace", "ShardName", "ThreadId"}) - // statsStateDurations tracks for each state how much time was spent in it. Mainly used for testing. - statsStateDurationsNs = stats.NewCounters("WorkerStateDurations") - - // statsOnlineInsertsCounters tracks for every table how many rows were - // inserted during the online clone (reconciliation) phase. - statsOnlineInsertsCounters = stats.NewCounters("WorkerOnlineInsertsCounters") - // statsOnlineUpdatesCounters tracks for every table how many rows were updated. - statsOnlineUpdatesCounters = stats.NewCounters("WorkerOnlineUpdatesCounters") - // statsOnlineUpdatesCounters tracks for every table how many rows were deleted. 
- statsOnlineDeletesCounters = stats.NewCounters("WorkerOnlineDeletesCounters") - // statsOnlineEqualRowsCounters tracks for every table how many rows were equal. - statsOnlineEqualRowsCounters = stats.NewCounters("WorkerOnlineEqualRowsCounters") - - // statsOfflineInsertsCounters tracks for every table how many rows were - // inserted during the online clone (reconciliation) phase. - statsOfflineInsertsCounters = stats.NewCounters("WorkerOfflineInsertsCounters") - // statsOfflineUpdatesCounters tracks for every table how many rows were updated. - statsOfflineUpdatesCounters = stats.NewCounters("WorkerOfflineUpdatesCounters") - // statsOfflineUpdatesCounters tracks for every table how many rows were deleted. - statsOfflineDeletesCounters = stats.NewCounters("WorkerOfflineDeletesCounters") - // statsOfflineEqualRowsCounters tracks for every table how many rows were equal. - statsOfflineEqualRowsCounters = stats.NewCounters("WorkerOfflineEqualRowsCounters") - - // statsStreamingQueryRestartsCounters tracks for every tablet alias how often - // a streaming query was succesfully established there. - statsStreamingQueryCounters = stats.NewCounters("StreamingQueryCounters") - // statsStreamingQueryErrorsCounters tracks for every tablet alias how often - // a (previously successfully established) streaming query did error. - statsStreamingQueryErrorsCounters = stats.NewCounters("StreamingQueryErrorsCounters") - // statsStreamingQueryRestartsSameTabletCounters tracks for every tablet alias - // how often we successfully restarted a streaming query on the first retry. - // This kind of restart is usually necessary when our streaming query is idle - // and MySQL aborts it after a timeout. 
- statsStreamingQueryRestartsSameTabletCounters = stats.NewCounters("StreamingQueryRestartsSameTabletCounters") - // statsStreamingQueryRestartsDifferentTablet records how many restarts were - // successful on the 2 (or higher) retry after the initial retry to the same - // tablet failed and we switched to a different tablet. In practice, this - // happens when a tablet did go away due to a maintenance operation. - statsStreamingQueryRestartsDifferentTablet = stats.NewInt("StreamingQueryRestartsDifferentTablet") + statsState = stats.NewString("WorkerState") + statsRetryCount = stats.NewCounter("WorkerRetryCount", "Total number of times a query to a vttablet had to be retried") + statsRetryCounters = stats.NewCountersWithLabels("WorkerRetryCounters", "Number of retries grouped by category e.g. TimeoutError or ReadOnly", "category") + statsThrottledCounters = stats.NewCountersWithMultiLabels( + "WorkerThrottledCounters", + `Number of times a write has been throttled grouped by (keyspace, shard, threadID). + Mainly used for testing. If throttling is enabled this should always be non-zero for all threads`, + []string{"Keyspace", "ShardName", "ThreadId"}) + statsStateDurationsNs = stats.NewGaugesWithLabels("WorkerStateDurations", "How much time was spent in each state. 
Mainly used for testing.", "state") + + statsOnlineInsertsCounters = stats.NewCountersWithLabels( + "WorkerOnlineInsertsCounters", + "For every table how many rows were inserted during the online clone (reconciliation) phase", + "table") + statsOnlineUpdatesCounters = stats.NewCountersWithLabels( + "WorkerOnlineUpdatesCounters", + "For every table how many rows were updated", + "table") + statsOnlineDeletesCounters = stats.NewCountersWithLabels( + "WorkerOnlineDeletesCounters", + "For every table how many rows were deleted", + "table") + statsOnlineEqualRowsCounters = stats.NewCountersWithLabels( + "WorkerOnlineEqualRowsCounters", + "For every table how many rows were equal", + "table") + + statsOfflineInsertsCounters = stats.NewCountersWithLabels( + "WorkerOfflineInsertsCounters", + "For every table how many rows were inserted during the online clone (reconciliation) phase", + "table") + statsOfflineUpdatesCounters = stats.NewCountersWithLabels( + "WorkerOfflineUpdatesCounters", + "For every table how many rows were updated", + "table") + statsOfflineDeletesCounters = stats.NewCountersWithLabels( + "WorkerOfflineDeletesCounters", + "For every table how many rows were deleted", + "table") + statsOfflineEqualRowsCounters = stats.NewCountersWithLabels( + "WorkerOfflineEqualRowsCounters", + "For every table how many rows were equal", + "table") + + statsStreamingQueryCounters = stats.NewCountersWithLabels( + "StreamingQueryCounters", + "For every tablet alias how often a streaming query was successfully established there", + "tablet_alias") + statsStreamingQueryErrorsCounters = stats.NewCountersWithLabels( + "StreamingQueryErrorsCounters", + "For every tablet alias how often a (previously successfully established) streaming query did error", + "tablet_alias") + statsStreamingQueryRestartsSameTabletCounters = stats.NewCountersWithLabels( + "StreamingQueryRestartsSameTabletCounters", + `For every tablet alias how often we successfully restarted a streaming query on the 
first retry. + This kind of restart is usually necessary when our streaming query is idle and MySQL aborts it after a timeout.`, + "tablet_alias") + statsStreamingQueryRestartsDifferentTablet = stats.NewCounter( + "StreamingQueryRestartsDifferentTablet", + `How many restarts were successful on the 2nd (or higher) retry after the initial retry to the same tablet failed and we switched to a different tablet. + In practice, this happens when a tablet did go away due to a maintenance operation.`) ) const ( @@ -118,21 +131,21 @@ const ( // per-run basis. This should be called at the beginning of each worker run. func resetVars() { statsState.Set("") - statsRetryCount.Set(0) - statsRetryCounters.Reset() + statsRetryCount.Reset() + statsRetryCounters.ResetAll() - statsOnlineInsertsCounters.Reset() - statsOnlineUpdatesCounters.Reset() - statsOnlineDeletesCounters.Reset() - statsOnlineEqualRowsCounters.Reset() + statsOnlineInsertsCounters.ResetAll() + statsOnlineUpdatesCounters.ResetAll() + statsOnlineDeletesCounters.ResetAll() + statsOnlineEqualRowsCounters.ResetAll() - statsOfflineInsertsCounters.Reset() - statsOfflineUpdatesCounters.Reset() - statsOfflineDeletesCounters.Reset() - statsOfflineEqualRowsCounters.Reset() + statsOfflineInsertsCounters.ResetAll() + statsOfflineUpdatesCounters.ResetAll() + statsOfflineDeletesCounters.ResetAll() + statsOfflineEqualRowsCounters.ResetAll() - statsStreamingQueryCounters.Reset() - statsStreamingQueryErrorsCounters.Reset() + statsStreamingQueryCounters.ResetAll() + statsStreamingQueryErrorsCounters.ResetAll() } // checkDone returns ctx.Err() iff ctx.Done().