diff --git a/pkg/base/test_server_args.go b/pkg/base/test_server_args.go
index 049ed2c3df50..15f6fafac694 100644
--- a/pkg/base/test_server_args.go
+++ b/pkg/base/test_server_args.go
@@ -122,7 +122,14 @@ type TestServerArgs struct {
 	UseDatabase string
 
 	// If set, this will be configured in the test server to check connections
-	// from other test servers and to report in the SQL introspection.
+	// from other test servers and to report in the SQL introspection. It is
+	// advised to make the name sufficiently unique, in order to prevent a
+	// TestCluster from accidentally getting messages from unrelated clusters in
+	// the same environment that used the same TCP ports recently (e.g. see
+	// https://github.com/cockroachdb/cockroach/issues/157838).
+	//
+	// If empty (most cases), a unique ClusterName is generated automatically, or
+	// a higher-level default is used (e.g. taken from TestClusterArgs).
 	ClusterName string
 
 	// Stopper can be used to stop the server. If not set, a stopper will be
diff --git a/pkg/cli/debug_recover_loss_of_quorum_test.go b/pkg/cli/debug_recover_loss_of_quorum_test.go
index cce476f2f814..d5fff9f835e3 100644
--- a/pkg/cli/debug_recover_loss_of_quorum_test.go
+++ b/pkg/cli/debug_recover_loss_of_quorum_test.go
@@ -135,8 +135,8 @@ func TestCollectInfoFromOnlineCluster(t *testing.T) {
 		"recover",
 		"collect-info",
 		"--insecure",
-		"--host",
-		tc.Server(2).AdvRPCAddr(),
+		"--host", tc.Server(2).AdvRPCAddr(),
+		"--cluster-name", tc.ClusterName(),
 		replicaInfoFileName,
 	})
 
@@ -548,6 +548,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"--confirm=y",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		"--plan=" + planFile,
 	})
 	require.NoError(t, err, "failed to run make-plan")
@@ -571,6 +572,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "apply-plan",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		"--confirm=y", planFile,
 	})
 	require.NoError(t, err, "failed to run apply plan")
@@ -586,6 +588,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "verify",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		planFile,
 	})
 	require.NoError(t, err, "failed to run verify plan")
@@ -635,6 +638,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "verify",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		planFile,
 	})
 	require.NoError(t, err, "failed to run verify plan")
diff --git a/pkg/cli/testdata/zip/partial1 b/pkg/cli/testdata/zip/partial1
index a3904653c8a5..b5ac7537cea6 100644
--- a/pkg/cli/testdata/zip/partial1
+++ b/pkg/cli/testdata/zip/partial1
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false /dev/null
+debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false --cluster-name= /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...
diff --git a/pkg/cli/testdata/zip/partial1_excluded b/pkg/cli/testdata/zip/partial1_excluded
index 9d1d401efbf0..1834802a1994 100644
--- a/pkg/cli/testdata/zip/partial1_excluded
+++ b/pkg/cli/testdata/zip/partial1_excluded
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false
+debug zip --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name= /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...
diff --git a/pkg/cli/testdata/zip/partial2 b/pkg/cli/testdata/zip/partial2
index 3cfcce8bccb2..77ae82cfa762 100644
--- a/pkg/cli/testdata/zip/partial2
+++ b/pkg/cli/testdata/zip/partial2
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false /dev/null
+debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name= /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...
diff --git a/pkg/cli/testdata/zip/testzip_concurrent b/pkg/cli/testdata/zip/testzip_concurrent
index 1bbfa28432d9..d5b1c6d3a879 100644
--- a/pkg/cli/testdata/zip/testzip_concurrent
+++ b/pkg/cli/testdata/zip/testzip_concurrent
@@ -227,4 +227,4 @@ zip
 [node ?] ? log files found
 [node ?] ? log files found
 [node ?] ? log files found
-debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false /dev/null
+debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false --cluster-name= /dev/null
diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go
index 3fd3d076e79e..0ccf923cd028 100644
--- a/pkg/cli/zip_test.go
+++ b/pkg/cli/zip_test.go
@@ -421,10 +421,11 @@ func TestConcurrentZip(t *testing.T) {
 	defer func(prevStderr *os.File) { stderr = prevStderr }(stderr)
 	stderr = os.Stdout
 
-	out, err := c.RunWithCapture("debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false " + os.DevNull)
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err := c.RunWithCapture(fmt.Sprintf(
+		"debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)
 
 	// Strip any non-deterministic messages.
 	out = eraseNonDeterministicZipOutput(out)
@@ -437,6 +438,8 @@ func TestConcurrentZip(t *testing.T) {
 	// which the original messages interleve with other messages mean the number
 	// of them after each series is collapsed is also non-derministic.
 	out = regexp.MustCompile(`\n`).ReplaceAllString(out, "")
+	// Replace the non-deterministic cluster name with a placeholder.
+	out = eraseClusterName(out, tc.ClusterName())
 
 	// We use datadriven simply to read the golden output file; we don't actually
 	// run any commands. Using datadriven allows TESTFLAGS=-rewrite.
@@ -541,9 +544,8 @@ func TestUnavailableZip(t *testing.T) {
 
 	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ServerArgs: base.TestServerArgs{
 		DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
-
-		Insecure: true,
-		Knobs:    base.TestingKnobs{Store: knobs},
+		Insecure:          true,
+		Knobs:             base.TestingKnobs{Store: knobs},
 	}})
 	defer tc.Stopper().Stop(context.Background())
 
@@ -559,9 +561,10 @@ func TestUnavailableZip(t *testing.T) {
 	defer close(ch)
 
 	// Run debug zip against node 1.
-	debugZipCommand :=
-		"debug zip --concurrency=1 --cpu-profile-duration=0 " + os.
-			DevNull + " --timeout=.5s"
+	debugZipCommand := fmt.Sprintf(
+		"debug zip --concurrency=1 --cpu-profile-duration=0 --timeout=.5s --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	)
 
 	t.Run("server 1", func(t *testing.T) {
 		c := TestCLI{
@@ -651,6 +654,10 @@ func baseZipOutput(nodeId int) []string {
 	return output
 }
 
+func eraseClusterName(str, name string) string {
+	return strings.ReplaceAll(str, name, "")
+}
+
 func eraseNonDeterministicZipOutput(out string) string {
 	re := regexp.MustCompile(`(?m)postgresql://.*$`)
 	out = re.ReplaceAllString(out, `postgresql://...`)
@@ -736,13 +743,15 @@ func TestPartialZip(t *testing.T) {
 	defer func(prevStderr *os.File) { stderr = prevStderr }(stderr)
 	stderr = os.Stdout
 
-	out, err := c.RunWithCapture("debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false " + os.DevNull)
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err := c.RunWithCapture(fmt.Sprintf(
+		"debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)
 
 	// Strip any non-deterministic messages.
 	t.Log(out)
+	out = eraseClusterName(out, tc.ClusterName())
 	out = eraseNonDeterministicZipOutput(out)
 
 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial1"),
 		func(t *testing.T, td *datadriven.TestData) string {
 			...
 		})
 
 	// Now do it again and exclude the down node explicitly.
-	out, err = c.RunWithCapture("debug zip " + os.DevNull + " --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0" +
-		" --validate-zip-file=false")
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err = c.RunWithCapture(fmt.Sprintf(
+		"debug zip --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)
+	out = eraseClusterName(out, tc.ClusterName())
 	out = eraseNonDeterministicZipOutput(out)
 
 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial1_excluded"),
 		func(t *testing.T, td *datadriven.TestData) string {
@@ -767,12 +777,11 @@ func TestPartialZip(t *testing.T) {
 	// skips over it automatically. We specifically use --wait=none because
 	// we're decommissioning a node in a 3-node cluster, so there's no node to
 	// up-replicate the under-replicated ranges to.
-	{
-		_, err := c.RunWithCapture(fmt.Sprintf("node decommission --checks=skip --wait=none %d", 2))
-		if err != nil {
-			t.Fatal(err)
-		}
-	}
+	_, err = c.RunWithCapture(fmt.Sprintf(
+		"node decommission --checks=skip --wait=none --cluster-name=%s %d",
+		tc.ClusterName(), 2,
+	))
+	require.NoError(t, err)
 
 	// We use .Override() here instead of SET CLUSTER SETTING in SQL to
 	// override the 1m15s minimum placed on the cluster setting. There
@@ -787,12 +796,13 @@ func TestPartialZip(t *testing.T) {
 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial2"),
 		func(t *testing.T, td *datadriven.TestData) string {
 			f := func() string {
-				out, err := c.RunWithCapture("debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false " + os.DevNull)
-				if err != nil {
-					t.Fatal(err)
-				}
-
+				out, err := c.RunWithCapture(fmt.Sprintf(
+					"debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=%s %s",
+					tc.ClusterName(), os.DevNull,
+				))
+				require.NoError(t, err)
 				// Strip any non-deterministic messages.
+				out = eraseClusterName(out, tc.ClusterName())
 				return eraseNonDeterministicZipOutput(out)
 			}
 
diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go
index f2b48022c1dd..b40622c2820c 100644
--- a/pkg/testutils/testcluster/testcluster.go
+++ b/pkg/testutils/testcluster/testcluster.go
@@ -9,6 +9,7 @@ import (
 	"context"
 	gosql "database/sql"
 	"fmt"
+	"math/rand/v2"
 	"net"
 	"os"
 	"reflect"
@@ -86,6 +87,11 @@ type TestCluster struct {
 
 var _ serverutils.TestClusterInterface = &TestCluster{}
 
+// ClusterName returns the configured or auto-generated cluster name.
+func (tc *TestCluster) ClusterName() string {
+	return tc.clusterArgs.ServerArgs.ClusterName
+}
+
 // NumServers is part of TestClusterInterface.
 func (tc *TestCluster) NumServers() int {
 	return len(tc.Servers)
@@ -317,6 +323,11 @@ func NewTestCluster(
 	if clusterArgs.StartSingleNode && nodes > 1 {
 		t.Fatal("StartSingleNode implies 1 node only, but asked to create", nodes)
 	}
+	if clusterArgs.ServerArgs.ClusterName == "" {
+		// Use a cluster name that is sufficiently unique (within the CI env) but is
+		// concise and recognizable.
+		clusterArgs.ServerArgs.ClusterName = fmt.Sprintf("TestCluster-%d", rand.Uint32())
+	}
 	if err := checkServerArgsForCluster(
 		clusterArgs.ServerArgs, clusterArgs.ReplicationMode, disallowJoinAddr,
@@ -634,6 +645,9 @@ func (tc *TestCluster) AddServer(
 	if serverArgs.JoinAddr != "" {
 		serverArgs.NoAutoInitializeCluster = true
 	}
+	if serverArgs.ClusterName == "" {
+		serverArgs.ClusterName = tc.clusterArgs.ServerArgs.ClusterName
+	}
 	// Check args even though we have called checkServerArgsForCluster()
 	// already in NewTestCluster(). AddServer might be called for servers