diff --git a/changelog/23.0/23.0.0/summary.md b/changelog/23.0/23.0.0/summary.md new file mode 100644 index 00000000000..b48297f52af --- /dev/null +++ b/changelog/23.0/23.0.0/summary.md @@ -0,0 +1,14 @@ +## Summary + +### Table of Contents +- **[Minor Changes](#minor-changes)** + - **[VTTablet](#minor-changes-vttablet)** + - [CLI Flags](#flags-vttablet) + +## Minor Changes + +### VTTablet + +#### CLI Flags + +- `skip-user-metrics` flag if enabled, replaces the username label with "UserLabelDisabled" to prevent metric explosion in environments with many unique users. \ No newline at end of file diff --git a/changelog/23.0/README.md b/changelog/23.0/README.md new file mode 100644 index 00000000000..42a151fd40c --- /dev/null +++ b/changelog/23.0/README.md @@ -0,0 +1,2 @@ +## v23.0 +* **[23.0.0](23.0.0)** diff --git a/changelog/README.md b/changelog/README.md index 7cade673d26..b5cfdedcc8c 100644 --- a/changelog/README.md +++ b/changelog/README.md @@ -1,4 +1,5 @@ ## Releases +* [23.0](23.0) * [22.0](22.0) * [21.0](21.0) * [20.0](20.0) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 9bb706b42b7..cf382598cae 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -326,6 +326,7 @@ Flags: --serving_state_grace_period duration how long to pause after broadcasting health to vtgate, before enforcing a new serving state --shard_sync_retry_delay duration delay between retries of updates to keep the tablet and its shard record in sync (default 30s) --shutdown_grace_period duration how long to wait for queries and transactions to complete during graceful shutdown. (default 3s) + --skip-user-metrics If true, user based stats are not recorded. --sql-max-length-errors int truncate queries in error logs to the given length (default unlimited) --sql-max-length-ui int truncate queries in debug UIs to the given length (default 512) (default 512) --srv_topo_cache_refresh duration how frequently to refresh the topology for cached entries (default 1s) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 04eb16edc25..973ee500558 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -327,6 +327,7 @@ Flags: --serving_state_grace_period duration how long to pause after broadcasting health to vtgate, before enforcing a new serving state --shard_sync_retry_delay duration delay between retries of updates to keep the tablet and its shard record in sync (default 30s) --shutdown_grace_period duration how long to wait for queries and transactions to complete during graceful shutdown. (default 3s) + --skip-user-metrics If true, user based stats are not recorded. --sql-max-length-errors int truncate queries in error logs to the given length (default unlimited) --sql-max-length-ui int truncate queries in debug UIs to the given length (default 512) (default 512) --srv_topo_cache_refresh duration how frequently to refresh the topology for cached entries (default 1s) diff --git a/go/vt/tableacl/acl/acl.go b/go/vt/tableacl/acl/acl.go index abff749384c..7180e2c00f7 100644 --- a/go/vt/tableacl/acl/acl.go +++ b/go/vt/tableacl/acl/acl.go @@ -47,3 +47,12 @@ type AcceptAllACL struct{} func (acl AcceptAllACL) IsMember(principal *querypb.VTGateCallerID) bool { return true } + +type ACLState int8 + +const ( + ACLUnknown ACLState = iota + ACLAllow + ACLDenied + ACLPseudoDenied +) diff --git a/go/vt/vttablet/endtoend/transaction_test.go b/go/vt/vttablet/endtoend/transaction_test.go index 5e5889ad671..50d9b06ffbd 100644 --- a/go/vt/vttablet/endtoend/transaction_test.go +++ b/go/vt/vttablet/endtoend/transaction_test.go @@ -846,3 +846,81 @@ func TestUnresolvedTransactionsOrdering(t *testing.T) { assert.Equal(t, want[i].Participants, transaction.Participants) } } + +// TestSkipUserMetrics tests the SkipUserMetrics flag in the config that disables user label in the metrics. +func TestSkipUserMetrics(t *testing.T) { + client := framework.NewClient() + query := "select * from vitess_test" + + runQueries := func() { + // non-tx execute + _, err := client.Execute(query, nil) + require.NoError(t, err) + + // tx execute + _, err = client.BeginExecute(query, nil, nil) + require.NoError(t, err) + require.NoError(t, client.Commit()) + } + + // Initial test with user metrics enabled + vstart := framework.DebugVars() + runQueries() + + expectedDiffs := []struct { + tag string + diff int + }{{ // not dependent on user + tag: "Transactions/TotalCount", diff: 1, + }, { // not dependent on user + tag: "Transactions/Histograms/commit/Count", diff: 1, + }, { // dependent on user + tag: "TableACLAllowed/vitess_test.vitess_test.Select.dev", diff: 2, + }, { // user metric enabled so this should be zero. + tag: "TableACLAllowed/vitess_test.vitess_test.Select.UserLabelDisabled", diff: 0, + }, { // dependent on user + tag: "UserTableQueryCount/vitess_test.dev.Execute", diff: 2, + }, { // user metric enabled so this should be zero. + tag: "UserTableQueryCount/vitess_test.UserLabelDisabled.Execute", diff: 0, + }, { // dependent on user + tag: "UserTransactionCount/dev.commit", diff: 1, + }} + vend := framework.DebugVars() + for _, expected := range expectedDiffs { + compareIntDiff(t, vend, expected.tag, vstart, expected.diff) + } + + // Enable SkipUserMetrics and re-run tests + framework.Server.Config().SkipUserMetrics = true + defer func() { + framework.Server.Config().SkipUserMetrics = false + }() + vstart = framework.DebugVars() + runQueries() + + expectedDiffs = []struct { + tag string + diff int + }{{ // not dependent on user + tag: "Transactions/TotalCount", diff: 1, + }, { // not dependent on user + tag: "Transactions/Histograms/commit/Count", diff: 1, + }, { // dependent on user - should be zero now + tag: "TableACLAllowed/vitess_test.vitess_test.Select.dev", diff: 0, + }, { // user metric disabled so this should be non-zero. + tag: "TableACLAllowed/vitess_test.vitess_test.Select.UserLabelDisabled", diff: 2, + }, { // dependent on user - should be zero now + tag: "UserTableQueryCount/vitess_test.dev.Execute", diff: 0, + }, { // user metric disabled so this should be non-zero. + tag: "UserTableQueryCount/vitess_test.UserLabelDisabled.Execute", diff: 2, + }, { // dependent on user + tag: "UserTransactionCount/dev.commit", diff: 0, + }, { // no need to publish this as "Transactions" histogram already captures this. + tag: "UserTransactionCount/UserLabelDisabled.commit", diff: 0, + }} + vend = framework.DebugVars() + for _, expected := range expectedDiffs { + compareIntDiff(t, vend, expected.tag, vstart, expected.diff) + } + +} diff --git a/go/vt/vttablet/tabletserver/dt_executor_test.go b/go/vt/vttablet/tabletserver/dt_executor_test.go index f83d7ff9006..ec2418e16e3 100644 --- a/go/vt/vttablet/tabletserver/dt_executor_test.go +++ b/go/vt/vttablet/tabletserver/dt_executor_test.go @@ -31,12 +31,11 @@ import ( "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" - "vitess.io/vitess/go/streamlog" - "vitess.io/vitess/go/event/syslogger" "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/mysql/sqlerror" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/streamlog" querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/vtenv" @@ -58,7 +57,7 @@ func TestTxExecutorEmptyPrepare(t *testing.T) { // taint the connection. sc, err := tsv.te.txPool.GetAndLock(txid, "taint") require.NoError(t, err) - sc.Taint(ctx, nil) + sc.Taint(ctx, tsv.te.reservedConnStats) sc.Unlock() err = txe.Prepare(txid, "aa") @@ -80,7 +79,7 @@ func TestExecutorPrepareFailure(t *testing.T) { // taint the connection. sc, err := tsv.te.txPool.GetAndLock(txid, "taint") require.NoError(t, err) - sc.Taint(ctx, nil) + sc.Taint(ctx, tsv.te.reservedConnStats) sc.Unlock() // try 2pc commit of Metadata Manager. @@ -374,7 +373,7 @@ func TestExecutorStartCommitFailure(t *testing.T) { // taint the connection. sc, err := tsv.te.txPool.GetAndLock(txid, "taint") require.NoError(t, err) - sc.Taint(ctx, nil) + sc.Taint(ctx, tsv.te.reservedConnStats) sc.Unlock() // add rollback state update expectation diff --git a/go/vt/vttablet/tabletserver/query_executor.go b/go/vt/vttablet/tabletserver/query_executor.go index 2a7d2049ba9..382cd9e58f2 100644 --- a/go/vt/vttablet/tabletserver/query_executor.go +++ b/go/vt/vttablet/tabletserver/query_executor.go @@ -40,6 +40,7 @@ import ( "vitess.io/vitess/go/vt/schema" "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/tableacl" + "vitess.io/vitess/go/vt/tableacl/acl" "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vtgate/evalengine" "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" @@ -69,9 +70,10 @@ type QueryExecutor struct { } const ( - streamRowsSize = 256 - resetLastIDQuery = "select last_insert_id(18446744073709547416)" - resetLastIDValue = 18446744073709547416 + streamRowsSize = 256 + resetLastIDQuery = "select last_insert_id(18446744073709547416)" + resetLastIDValue = 18446744073709547416 + userLabelDisabled = "UserLabelDisabled" ) var ( @@ -530,10 +532,14 @@ func (qre *QueryExecutor) checkPermissions() error { } func (qre *QueryExecutor) checkAccess(authorized *tableacl.ACLResult, tableName string, callerID *querypb.VTGateCallerID) error { - statsKey := []string{tableName, authorized.GroupName, qre.plan.PlanID.String(), callerID.Username} + var aclState acl.ACLState + defer func() { + statsKey := qre.generateACLStatsKey(tableName, authorized, callerID) + qre.recordACLStats(statsKey, aclState) + }() if !authorized.IsMember(callerID) { if qre.tsv.qe.enableTableACLDryRun { - qre.tsv.Stats().TableaclPseudoDenied.Add(statsKey, 1) + aclState = acl.ACLPseudoDenied return nil } @@ -547,17 +553,37 @@ func (qre *QueryExecutor) checkAccess(authorized *tableacl.ACLResult, tableName if len(callerID.Groups) > 0 { groupStr = fmt.Sprintf(", in groups [%s],", strings.Join(callerID.Groups, ", ")) } + aclState = acl.ACLDenied errStr := fmt.Sprintf("%s command denied to user '%s'%s for table '%s' (ACL check error)", qre.plan.PlanID.String(), callerID.Username, groupStr, tableName) - qre.tsv.Stats().TableaclDenied.Add(statsKey, 1) qre.tsv.qe.accessCheckerLogger.Infof("%s", errStr) return vterrors.Errorf(vtrpcpb.Code_PERMISSION_DENIED, "%s", errStr) } return nil } - qre.tsv.Stats().TableaclAllowed.Add(statsKey, 1) + aclState = acl.ACLAllow return nil } +func (qre *QueryExecutor) generateACLStatsKey(tableName string, authorized *tableacl.ACLResult, callerID *querypb.VTGateCallerID) []string { + if qre.tsv.Config().SkipUserMetrics { + return []string{tableName, authorized.GroupName, qre.plan.PlanID.String(), userLabelDisabled} + } + return []string{tableName, authorized.GroupName, qre.plan.PlanID.String(), callerID.Username} +} + +func (qre *QueryExecutor) recordACLStats(key []string, aclState acl.ACLState) { + switch aclState { + case acl.ACLAllow: + qre.tsv.Stats().TableaclAllowed.Add(key, 1) + case acl.ACLDenied: + qre.tsv.Stats().TableaclDenied.Add(key, 1) + case acl.ACLPseudoDenied: + qre.tsv.Stats().TableaclPseudoDenied.Add(key, 1) + case acl.ACLUnknown: + // nothing to record here. + } +} + func (qre *QueryExecutor) execDDL(conn *StatefulConnection) (result *sqltypes.Result, err error) { // Let's see if this is a normal DDL statement or an Online DDL statement. // An Online DDL statement is identified by /*vt+ .. */ comment with expected directives, like uuid etc. @@ -1274,9 +1300,14 @@ func (qre *QueryExecutor) execStreamSQL(conn *connpool.PooledConn, isTransaction } func (qre *QueryExecutor) recordUserQuery(queryType string, duration int64) { - username := callerid.GetPrincipal(callerid.EffectiveCallerIDFromContext(qre.ctx)) - if username == "" { - username = callerid.GetUsername(callerid.ImmediateCallerIDFromContext(qre.ctx)) + var username string + if qre.tsv.config.SkipUserMetrics { + username = userLabelDisabled + } else { + username = callerid.GetPrincipal(callerid.EffectiveCallerIDFromContext(qre.ctx)) + if username == "" { + username = callerid.GetUsername(callerid.ImmediateCallerIDFromContext(qre.ctx)) + } } tableName := qre.plan.TableName().String() qre.tsv.Stats().UserTableQueryCount.Add([]string{tableName, username, queryType}, 1) diff --git a/go/vt/vttablet/tabletserver/stateful_connection.go b/go/vt/vttablet/tabletserver/stateful_connection.go index 10fc763984f..d64d2fefe9c 100644 --- a/go/vt/vttablet/tabletserver/stateful_connection.go +++ b/go/vt/vttablet/tabletserver/stateful_connection.go @@ -179,7 +179,7 @@ func (sc *StatefulConnection) ReleaseString(reason string) { } sc.dbConn.Recycle() sc.dbConn = nil - sc.logReservedConn() + sc.logReservedConn(reason) } // Renew the existing connection with new connection id. @@ -260,7 +260,11 @@ func (sc *StatefulConnection) Taint(ctx context.Context, stats *servenv.TimingsW Stats: stats, } sc.dbConn.Taint() - sc.Stats().UserActiveReservedCount.Add(sc.getUsername(), 1) + if sc.env.Config().SkipUserMetrics { + sc.Stats().UserActiveReservedCount.Add(userLabelDisabled, 1) + } else { + sc.Stats().UserActiveReservedCount.Add(sc.getUsername(), 1) + } return nil } @@ -282,9 +286,11 @@ func (sc *StatefulConnection) LogTransaction(reason tx.ReleaseReason) { username = callerid.GetUsername(sc.txProps.ImmediateCaller) } duration := sc.txProps.EndTime.Sub(sc.txProps.StartTime) - sc.Stats().UserTransactionCount.Add([]string{username, reason.Name()}, 1) - sc.Stats().UserTransactionTimesNs.Add([]string{username, reason.Name()}, int64(duration)) sc.txProps.Stats.Add(reason.Name(), duration) + if !sc.env.Config().SkipUserMetrics { + sc.Stats().UserTransactionCount.Add([]string{username, reason.Name()}, 1) + sc.Stats().UserTransactionTimesNs.Add([]string{username, reason.Name()}, int64(duration)) + } tabletenv.TxLogger.Send(sc) } @@ -294,15 +300,19 @@ func (sc *StatefulConnection) SetTimeout(timeout time.Duration) { } // logReservedConn logs reserved connection related stats. -func (sc *StatefulConnection) logReservedConn() { +func (sc *StatefulConnection) logReservedConn(reason string) { if sc.reservedProps == nil { return // Nothing to log as this connection is not reserved. } - duration := time.Since(sc.reservedProps.StartTime) - username := sc.getUsername() - sc.Stats().UserActiveReservedCount.Add(username, -1) - sc.Stats().UserReservedCount.Add(username, 1) - sc.Stats().UserReservedTimesNs.Add(username, int64(duration)) + sc.reservedProps.Stats.Record(reason, sc.reservedProps.StartTime) + if sc.env.Config().SkipUserMetrics { + sc.Stats().UserActiveReservedCount.Add(userLabelDisabled, -1) + } else { + username := sc.getUsername() + sc.Stats().UserActiveReservedCount.Add(username, -1) + sc.Stats().UserReservedCount.Add(username, 1) + sc.Stats().UserReservedTimesNs.Add(username, int64(time.Since(sc.reservedProps.StartTime))) + } } func (sc *StatefulConnection) getUsername() string { diff --git a/go/vt/vttablet/tabletserver/stateful_connection_pool_test.go b/go/vt/vttablet/tabletserver/stateful_connection_pool_test.go index b93c822cfdc..cb7b9190abf 100644 --- a/go/vt/vttablet/tabletserver/stateful_connection_pool_test.go +++ b/go/vt/vttablet/tabletserver/stateful_connection_pool_test.go @@ -27,6 +27,7 @@ import ( "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/dbconfigs" querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/vttablet/tabletserver/tx" ) @@ -96,24 +97,25 @@ func TestStatefulPoolShutdownNonTx(t *testing.T) { pool := newActivePool() params := dbconfigs.New(db.ConnParams()) pool.Open(params, params, params) + rcStats := servenv.NewExporter("TestStatefulPoolShutdownNonTx", "").NewTimings("rconn", "test1", "test2") // conn1 non-tx, not in use. conn1, err := pool.NewConn(ctx, &querypb.ExecuteOptions{}, nil) require.NoError(t, err) - conn1.Taint(ctx, nil) + conn1.Taint(ctx, rcStats) conn1.Unlock() // conn2 tx, not in use. conn2, err := pool.NewConn(ctx, &querypb.ExecuteOptions{}, nil) require.NoError(t, err) - conn2.Taint(ctx, nil) + conn2.Taint(ctx, rcStats) conn2.txProps = &tx.Properties{} conn2.Unlock() // conn3 non-tx, in use. conn3, err := pool.NewConn(ctx, &querypb.ExecuteOptions{}, nil) require.NoError(t, err) - conn3.Taint(ctx, nil) + conn3.Taint(ctx, rcStats) // After ShutdownNonTx, conn1 should be closed, but not conn3. pool.ShutdownNonTx() diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index e4ca2bfc96a..b52157b1452 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -216,6 +216,7 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { fs.BoolVar(¤tConfig.EnableViews, "queryserver-enable-views", false, "Enable views support in vttablet.") fs.BoolVar(¤tConfig.EnablePerWorkloadTableMetrics, "enable-per-workload-table-metrics", defaultConfig.EnablePerWorkloadTableMetrics, "If true, query counts and query error metrics include a label that identifies the workload") + fs.BoolVar(¤tConfig.SkipUserMetrics, "skip-user-metrics", defaultConfig.SkipUserMetrics, "If true, user based stats are not recorded.") fs.BoolVar(¤tConfig.Unmanaged, "unmanaged", false, "Indicates an unmanaged tablet, i.e. using an external mysql-compatible database") } @@ -370,6 +371,7 @@ type TabletConfig struct { EnableViews bool `json:"-"` EnablePerWorkloadTableMetrics bool `json:"-"` + SkipUserMetrics bool `json:"-"` } func (cfg *TabletConfig) MarshalJSON() ([]byte, error) { diff --git a/misc/git/hooks/golangci-lint b/misc/git/hooks/golangci-lint index 10fc03f0505..fa81f5b52a3 100755 --- a/misc/git/hooks/golangci-lint +++ b/misc/git/hooks/golangci-lint @@ -52,7 +52,7 @@ else if ! version_greater_or_equal "$INSTALLED_VERSION" "${REQUIRED_VERSION#v}"; then echo "golangci-lint version $INSTALLED_VERSION found, but $REQUIRED_VERSION or newer is required." echo "Installing version $REQUIRED_VERSION..." - go install github.com/golangci/golangci-lint/cmd/golangci-lint@$REQUIRED_VERSION + go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@$REQUIRED_VERSION fi fi