Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions go/vt/topo/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import (
"sync"

"golang.org/x/net/context"

"vitess.io/vitess/go/vt/log"
)

Expand Down Expand Up @@ -175,13 +176,15 @@ func NewWithFactory(factory Factory, serverAddress, root string) (*Server, error
if err != nil {
return nil, err
}
conn = NewStatsConn(GlobalCell, conn)

var connReadOnly Conn
if factory.HasGlobalReadOnlyCell(serverAddress, root) {
connReadOnly, err = factory.Create(GlobalReadOnlyCell, serverAddress, root)
if err != nil {
return nil, err
}
connReadOnly = NewStatsConn(GlobalReadOnlyCell, connReadOnly)
} else {
connReadOnly = conn
}
Expand Down Expand Up @@ -257,6 +260,7 @@ func (ts *Server) ConnForCell(ctx context.Context, cell string) (Conn, error) {
if err != nil {
return nil, fmt.Errorf("failed to create topo connection to %v, %v: %v", ci.ServerAddress, ci.Root, err)
}
conn = NewStatsConn(cell, conn)
ts.cells[cell] = conn
return conn, nil
}
Expand Down
157 changes: 157 additions & 0 deletions go/vt/topo/stats_conn.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
Copyright 2018 The Vitess Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package topo

import (
"time"

"golang.org/x/net/context"

"vitess.io/vitess/go/stats"
)

var _ Conn = (*StatsConn)(nil)

var (
topoStatsConnTimings = stats.NewMultiTimings(
"TopologyConnOperations",
"TopologyConnOperations timings",
[]string{"Operation", "Cell"})

topoStatsConnErrors = stats.NewCountersWithMultiLabels(
"TopologyConnErrors",
"TopologyConnErrors errors per operation",
[]string{"Operation", "Cell"})
)

// The StatsConn is a wrapper for a Conn that emits stats for every operation
type StatsConn struct {
cell string
conn Conn
}

// NewStatsConn returns a StatsConn
func NewStatsConn(cell string, conn Conn) *StatsConn {
return &StatsConn{
cell: cell,
conn: conn,
}
}

// ListDir is part of the Conn interface
func (st *StatsConn) ListDir(ctx context.Context, dirPath string, full bool) ([]DirEntry, error) {
startTime := time.Now()
statsKey := []string{"ListDir", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
res, err := st.conn.ListDir(ctx, dirPath, full)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how much visibility you want into the class of failures but it potentially useful to tag based on the error code. So something like adding:

// vt/topo/errors.go
// caveat: ideally we'd convert this back to *string for display or have a parallel ErrorCodeName
func ErrorCode(e error) *ErrorCode {
	if e == nil { return nil }
	switch err := e.(type) {
	case *Error:
		return err.code
	case Error:
		return err.code
	default:
		return nil
	}
}

and then in your error handling (I'm taking some liberty with go for brevity ❤️ ... and also eliding some of the necessary error handling / details)

if err != nil {
  topoStatsConnErrors.Add(statsKey, int64(1))
  topoStatsConnScopedErrors.Add([...statsKey, topo.ErrorCode(err)], int64(1))
}

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for now I would like to keep the generic error count. I'm mostly interested in tracking general errors as we make the changes from skipping consul agent and talking to consul via the HA proxy vs normal operations.

Should we iterate in the future if we see need for more granularity ?

Copy link
Copy Markdown
Contributor

@falun falun Nov 29, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor, not strong, objections; presumably something downstream of these calls is logging error codes so we can check aggregation for details if necessary?

My thought here is that if our error counts are non-zero (and we don't know because we don't have telemetry right now, correct?) then moving from non-zero to non-zero as we shift between consul and consul-via-proxy doesn't actually tell us if we've functionally changed the behavior since the type of errors are also important.

Two caveats

  1. If we know/suspect there are no errors in steady-state then this is less important
  2. If my read of stats.CountersWithMultiLabels is incorrect and it's more effort than just adding a single scoped counter then the effort:(payoff | code noise) ratio changes and it's less compelling for me

That said, if you want, I'm pretty okay pushing this into the the future should we need it.

Copy link
Copy Markdown
Member Author

@rafael rafael Nov 29, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with your reasoning here.

My current understanding is number 1. I'm expecting to have zero errors during normal operations. So thinking it should be ok.

if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return res, err
}
return res, err
}

// Create is part of the Conn interface
func (st *StatsConn) Create(ctx context.Context, filePath string, contents []byte) (Version, error) {
startTime := time.Now()
statsKey := []string{"Create", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
res, err := st.conn.Create(ctx, filePath, contents)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return res, err
}
return res, err
}

// Update is part of the Conn interface
func (st *StatsConn) Update(ctx context.Context, filePath string, contents []byte, version Version) (Version, error) {
startTime := time.Now()
statsKey := []string{"Update", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
res, err := st.conn.Update(ctx, filePath, contents, version)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return res, err
}
return res, err
}

// Get is part of the Conn interface
func (st *StatsConn) Get(ctx context.Context, filePath string) ([]byte, Version, error) {
startTime := time.Now()
statsKey := []string{"Get", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
bytes, version, err := st.conn.Get(ctx, filePath)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return bytes, version, err
}
return bytes, version, err
}

// Delete is part of the Conn interface
func (st *StatsConn) Delete(ctx context.Context, filePath string, version Version) error {
startTime := time.Now()
statsKey := []string{"Delete", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
err := st.conn.Delete(ctx, filePath, version)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return err
}
return err
}

// Lock is part of the Conn interface
func (st *StatsConn) Lock(ctx context.Context, dirPath, contents string) (LockDescriptor, error) {
startTime := time.Now()
statsKey := []string{"Lock", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
res, err := st.conn.Lock(ctx, dirPath, contents)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return res, err
}
return res, err
}

// Watch is part of the Conn interface
func (st *StatsConn) Watch(ctx context.Context, filePath string) (current *WatchData, changes <-chan *WatchData, cancel CancelFunc) {
startTime := time.Now()
statsKey := []string{"Watch", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
return st.conn.Watch(ctx, filePath)
}

// NewMasterParticipation is part of the Conn interface
func (st *StatsConn) NewMasterParticipation(name, id string) (MasterParticipation, error) {
startTime := time.Now()
statsKey := []string{"NewMasterParticipation", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
res, err := st.conn.NewMasterParticipation(name, id)
if err != nil {
topoStatsConnErrors.Add(statsKey, int64(1))
return res, err
}
return res, err
}

// Close is part of the Conn interface
func (st *StatsConn) Close() {
startTime := time.Now()
statsKey := []string{"Close", st.cell}
defer topoStatsConnTimings.Record(statsKey, startTime)
st.conn.Close()
}
Loading