diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index a93c690f2521..677f232bfdbd 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -300,4 +300,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collector string the address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -version version 1000022.1-74 set the active cluster version in the format '.' +version version 1000022.1-76 set the active cluster version in the format '.' diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index f103a365fd0f..9ae6e6652691 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -234,6 +234,6 @@ trace.opentelemetry.collectorstringaddress of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabledbooleantrueif set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collectorstringthe address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -versionversion1000022.1-74set the active cluster version in the format '.' +versionversion1000022.1-76set the active cluster version in the format '.' diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go index dd91fa70ab53..8be9069bc9fd 100644 --- a/pkg/clusterversion/cockroach_versions.go +++ b/pkg/clusterversion/cockroach_versions.go @@ -302,6 +302,10 @@ const ( // supported in cloud storage and KMS. SupportAssumeRoleAuth + // FixUserfileRelatedDescriptorCorruption adds a migration which uses + // heuristics to identify invalid table descriptors for userfile-related + // descriptors. + FixUserfileRelatedDescriptorCorruption // ************************************************* // Step (1): Add new versions here. // Do not add new versions to a patch release. @@ -488,7 +492,10 @@ var rawVersionsSingleton = keyedVersions{ Key: SupportAssumeRoleAuth, Version: roachpb.Version{Major: 22, Minor: 1, Internal: 74}, }, - + { + Key: FixUserfileRelatedDescriptorCorruption, + Version: roachpb.Version{Major: 22, Minor: 1, Internal: 76}, + }, // ************************************************* // Step (2): Add new versions here. // Do not add new versions to a patch release. diff --git a/pkg/clusterversion/key_string.go b/pkg/clusterversion/key_string.go index fe1a7c3e54a1..977ec8f35af8 100644 --- a/pkg/clusterversion/key_string.go +++ b/pkg/clusterversion/key_string.go @@ -47,11 +47,12 @@ func _() { _ = x[PrioritizeSnapshots-35] _ = x[EnableLeaseUpgrade-36] _ = x[SupportAssumeRoleAuth-37] + _ = x[FixUserfileRelatedDescriptorCorruption-38] } -const _Key_name = "invalidVersionKeyV22_1Start22_2LocalTimestampsPebbleFormatSplitUserKeysMarkedCompactedEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeysTrigramInvertedIndexesRemoveGrantPrivilegeMVCCRangeTombstonesUpgradeSequenceToBeReferencedByIDSampledStmtDiagReqsAddSSTableTombstonesSystemPrivilegesTableEnablePredicateProjectionChangefeedAlterSystemSQLInstancesAddLocalitySystemExternalConnectionsTableAlterSystemStatementStatisticsAddIndexRecommendationsRoleIDSequenceAddSystemUserIDColumnSystemUsersIDColumnIsBackfilledSetSystemUsersUserIDColumnNotNullSQLSchemaTelemetryScheduledJobsSchemaChangeSupportsCreateFunctionDeleteRequestReturnKeyPebbleFormatPrePebblev1MarkedRoleOptionsTableHasIDColumnRoleOptionsIDColumnIsBackfilledSetRoleOptionsUserIDColumnNotNullUseDelRangeInGCJobWaitedForDelRangeInGCJobRangefeedUseOneStreamPerNodeNoNonMVCCAddSSTableGCHintInReplicaStateUpdateInvalidColumnIDsInSequenceBackReferencesTTLDistSQLPrioritizeSnapshotsEnableLeaseUpgradeSupportAssumeRoleAuth" +const _Key_name = "invalidVersionKeyV22_1Start22_2LocalTimestampsPebbleFormatSplitUserKeysMarkedCompactedEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeysTrigramInvertedIndexesRemoveGrantPrivilegeMVCCRangeTombstonesUpgradeSequenceToBeReferencedByIDSampledStmtDiagReqsAddSSTableTombstonesSystemPrivilegesTableEnablePredicateProjectionChangefeedAlterSystemSQLInstancesAddLocalitySystemExternalConnectionsTableAlterSystemStatementStatisticsAddIndexRecommendationsRoleIDSequenceAddSystemUserIDColumnSystemUsersIDColumnIsBackfilledSetSystemUsersUserIDColumnNotNullSQLSchemaTelemetryScheduledJobsSchemaChangeSupportsCreateFunctionDeleteRequestReturnKeyPebbleFormatPrePebblev1MarkedRoleOptionsTableHasIDColumnRoleOptionsIDColumnIsBackfilledSetRoleOptionsUserIDColumnNotNullUseDelRangeInGCJobWaitedForDelRangeInGCJobRangefeedUseOneStreamPerNodeNoNonMVCCAddSSTableGCHintInReplicaStateUpdateInvalidColumnIDsInSequenceBackReferencesTTLDistSQLPrioritizeSnapshotsEnableLeaseUpgradeSupportAssumeRoleAuthFixUserfileRelatedDescriptorCorruption" -var _Key_index = [...]uint16{0, 17, 22, 31, 46, 86, 120, 154, 176, 196, 215, 248, 267, 287, 308, 343, 377, 407, 460, 474, 495, 526, 559, 590, 624, 646, 675, 702, 733, 766, 784, 808, 836, 855, 875, 921, 931, 950, 968, 989} +var _Key_index = [...]uint16{0, 17, 22, 31, 46, 86, 120, 154, 176, 196, 215, 248, 267, 287, 308, 343, 377, 407, 460, 474, 495, 526, 559, 590, 624, 646, 675, 702, 733, 766, 784, 808, 836, 855, 875, 921, 931, 950, 968, 989, 1027} func (i Key) String() string { i -= -1 diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 86a4d4322856..3f7f42e807a4 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1023,7 +1023,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { knobs, _ := cfg.TestingKnobs.UpgradeManager.(*upgrade.TestingKnobs) migrationMgr := upgrademanager.NewManager( - systemDeps, leaseMgr, cfg.circularInternalExecutor, jobRegistry, codec, + systemDeps, leaseMgr, cfg.circularInternalExecutor, cfg.internalExecutorFactory, jobRegistry, codec, cfg.Settings, knobs, ) execCfg.UpgradeJobDeps = migrationMgr diff --git a/pkg/upgrade/upgrademanager/BUILD.bazel b/pkg/upgrade/upgrademanager/BUILD.bazel index 94da57089cd1..c96e760fbc4c 100644 --- a/pkg/upgrade/upgrademanager/BUILD.bazel +++ b/pkg/upgrade/upgrademanager/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/server/serverpb", "//pkg/settings/cluster", "//pkg/sql", + "//pkg/sql/catalog/descs", "//pkg/sql/catalog/lease", "//pkg/sql/protoreflect", "//pkg/sql/sem/tree", diff --git a/pkg/upgrade/upgrademanager/manager.go b/pkg/upgrade/upgrademanager/manager.go index 4a0c83cbeb7d..5362bcce4494 100644 --- a/pkg/upgrade/upgrademanager/manager.go +++ b/pkg/upgrade/upgrademanager/manager.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/server/serverpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descs" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/protoreflect" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -44,6 +45,7 @@ type Manager struct { deps upgrade.SystemDeps lm *lease.Manager ie sqlutil.InternalExecutor + ief descs.TxnManager jr *jobs.Registry codec keys.SQLCodec settings *cluster.Settings @@ -72,6 +74,7 @@ func NewManager( deps upgrade.SystemDeps, lm *lease.Manager, ie sqlutil.InternalExecutor, + ief descs.TxnManager, jr *jobs.Registry, codec keys.SQLCodec, settings *cluster.Settings, @@ -85,6 +88,7 @@ func NewManager( deps: deps, lm: lm, ie: ie, + ief: ief, jr: jr, codec: codec, settings: settings, @@ -411,12 +415,13 @@ func (m *Manager) checkPreconditions( continue } if err := tm.Precondition(ctx, v, upgrade.TenantDeps{ - DB: m.deps.DB, - Codec: m.codec, - Settings: m.settings, - LeaseManager: m.lm, - InternalExecutor: m.ie, - JobRegistry: m.jr, + DB: m.deps.DB, + Codec: m.codec, + Settings: m.settings, + LeaseManager: m.lm, + InternalExecutor: m.ie, + InternalExecutorFactory: m.ief, + JobRegistry: m.jr, }); err != nil { return errors.Wrapf( err, diff --git a/pkg/upgrade/upgrades/BUILD.bazel b/pkg/upgrade/upgrades/BUILD.bazel index 40bf00e56ee2..9195fb71c824 100644 --- a/pkg/upgrade/upgrades/BUILD.bazel +++ b/pkg/upgrade/upgrades/BUILD.bazel @@ -8,6 +8,7 @@ go_library( "alter_statement_statistics_index_recommendations.go", "descriptor_utils.go", "ensure_sql_schema_telemetry_schedule.go", + "fix_userfile_descriptor_corruption.go", "precondition_before_starting_an_upgrade.go", "remove_grant_migration.go", "role_id_sequence_migration.go", @@ -50,6 +51,7 @@ go_library( "//pkg/sql/sem/tree", "//pkg/sql/sessiondata", "//pkg/sql/sqlutil", + "//pkg/sql/types", "//pkg/storage", "//pkg/upgrade", "//pkg/util/hlc", @@ -71,6 +73,7 @@ go_test( "builtins_test.go", "descriptor_utils_test.go", "ensure_sql_schema_telemetry_schedule_test.go", + "fix_userfile_descriptor_corruption_test.go", "helpers_test.go", "main_test.go", "precondition_before_starting_an_upgrade_external_test.go", diff --git a/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption.go b/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption.go new file mode 100644 index 000000000000..14227119df93 --- /dev/null +++ b/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption.go @@ -0,0 +1,162 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package upgrades + +import ( + "context" + "strings" + + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/jobs" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descs" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/upgrade" + "github.com/cockroachdb/cockroach/pkg/util/log" +) + +// fixInvalidObjectsThatLookLikeBadUserfileConstraint attempts to remove a +// dangling table descriptor mutation for foreign key constraints on +// usefile-related upload_payload tables. +// +// It only proceeds with the fix if +// +// - the fk mutation it the _only_ mutation on the table, +// - the table name, column names, and column type all look like a userfile table, and +// - the fk is referencing a table whose name, column names, and column types all look like a userfile table. +// +// To fix the table, we remove the mutation as the FK constraint is unnecessary +// in the current implementation. +func fixInvalidObjectsThatLookLikeBadUserfileConstraint( + ctx context.Context, _ clusterversion.ClusterVersion, d upgrade.TenantDeps, _ *jobs.Job, +) error { + return d.InternalExecutorFactory.DescsTxnWithExecutor(ctx, d.DB, nil, + func(ctx context.Context, txn *kv.Txn, descriptors *descs.Collection, ie sqlutil.InternalExecutor) error { + query := `SELECT * FROM crdb_internal.invalid_objects` + rows, err := ie.QueryIterator(ctx, "find-invalid-descriptors", txn, query) + if err != nil { + return err + } + defer func() { _ = rows.Close() }() + + var hasNext bool + for hasNext, err = rows.Next(ctx); hasNext && err == nil; hasNext, err = rows.Next(ctx) { + // crdb_internal.invalid_objects has five columns: id, database name, schema name, table name, error. + row := rows.Cur() + tableID := descpb.ID(tree.MustBeDInt(row[0])) + errString := string(tree.MustBeDString(row[4])) + if veryLikelyKnownUserfileBreakage(ctx, txn, descriptors, tableID, errString) { + log.Infof(ctx, "attempting to fix invalid table descriptor %d assuming it is a userfile-related table", tableID) + mutTableDesc, err := descriptors.GetMutableTableByID(ctx, txn, tableID, tree.ObjectLookupFlags{ + CommonLookupFlags: tree.CommonLookupFlags{ + IncludeOffline: true, + IncludeDropped: true, + }, + }) + if err != nil { + return err + } + mutTableDesc.Mutations = nil + mutTableDesc.MutationJobs = nil + if err := descriptors.WriteDesc(ctx, false, mutTableDesc, txn); err != nil { + return err + } + } + } + if err != nil { + // TODO(ssd): We always return a nil error here because I'm not sure that this + // would be worth failing an upgrade for. + log.Warningf(ctx, "could not fix broken userfile: %v", err) + } + return nil + }) +} + +// veryLikelyKnownUserfileBreakage returns true if the given descriptor id and +// error message from crdb_internal.invalid_objects is likely related to a known +// userfile-related table corruption. +func veryLikelyKnownUserfileBreakage( + ctx context.Context, txn *kv.Txn, descriptors *descs.Collection, id descpb.ID, errMsg string, +) bool { + if !strings.HasSuffix(errMsg, "job not found") { + return false + } + + tableDesc, err := descriptors.GetImmutableTableByID(ctx, txn, id, tree.ObjectLookupFlags{ + CommonLookupFlags: tree.CommonLookupFlags{ + IncludeOffline: true, + IncludeDropped: true, + }, + }) + if err != nil { + return false + } + return tableLooksLikeUserfilePayloadTable(tableDesc) && mutationsLookLikeuserfilePayloadCorruption(ctx, txn, tableDesc, descriptors) +} + +func tableLooksLikeUserfilePayloadTable(tableDesc catalog.TableDescriptor) bool { + columns := tableDesc.PublicColumns() + return strings.HasSuffix(tableDesc.GetName(), "_upload_payload") && + len(columns) == 3 && + (columns[0].ColName() == "file_id" && columns[0].GetType().Oid() == types.Uuid.Oid()) && + (columns[1].ColName() == "byte_offset" && columns[1].GetType().Oid() == types.Int.Oid()) && + (columns[2].ColName() == "payload" && columns[2].GetType().Oid() == types.Bytes.Oid()) +} + +func tableLooksLikeUserfileFileTable(tableDesc catalog.TableDescriptor) bool { + columns := tableDesc.PublicColumns() + return strings.HasSuffix(tableDesc.GetName(), "_upload_files") && + len(columns) == 5 && + (columns[0].ColName() == "filename" && columns[0].GetType().Oid() == types.String.Oid()) && + (columns[1].ColName() == "file_id" && columns[1].GetType().Oid() == types.Uuid.Oid()) && + (columns[2].ColName() == "file_size" && columns[2].GetType().Oid() == types.Int.Oid()) && + (columns[3].ColName() == "username" && columns[3].GetType().Oid() == types.String.Oid()) && + (columns[4].ColName() == "upload_time" && columns[4].GetType().Oid() == types.Timestamp.Oid()) +} + +func mutationsLookLikeuserfilePayloadCorruption( + ctx context.Context, + txn *kv.Txn, + tableDesc catalog.TableDescriptor, + descriptors *descs.Collection, +) bool { + if len(tableDesc.GetMutationJobs()) != 1 { + return false + } + if len(tableDesc.AllMutations()) != 1 { + return false + } + mutation := tableDesc.AllMutations()[0] + if mutation.Adding() && mutation.DeleteOnly() { + if constraintMutation := mutation.AsConstraint(); constraintMutation != nil { + if constraintMutation.IsForeignKey() { + fkConstraint := constraintMutation.ForeignKey() + targetTableDesc, err := descriptors.GetImmutableTableByID(ctx, txn, fkConstraint.ReferencedTableID, tree.ObjectLookupFlags{ + CommonLookupFlags: tree.CommonLookupFlags{ + IncludeOffline: true, + IncludeDropped: true, + }, + }) + if err != nil { + return false + } + if tableLooksLikeUserfileFileTable(targetTableDesc) { + return true + } + } + } + } + return false +} diff --git a/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption_test.go b/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption_test.go new file mode 100644 index 000000000000..5d35d7489c37 --- /dev/null +++ b/pkg/upgrade/upgrades/fix_userfile_descriptor_corruption_test.go @@ -0,0 +1,110 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package upgrades_test + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +// The hex for the descriptor to inject was created by running the following +// commands in a 22.1 binary. +// +// In 22.1 and prior, userfile creation had a bug that produced a foreign +// key constraint mutation without an associated mutation job. +// +// CREATE DATABASE to_backup; +// BACKUP DATABASE to_backup INTO 'userfile://defaultdb.test/data'; +// +// SELECT encode(descriptor, 'hex') +// FROM system.descriptor +// WHERE id = ( +// SELECT id +// FROM system.namespace +// WHERE name = 'test_upload_payload' +// ); +// SELECT encode(descriptor, 'hex') +// FROM system.descriptor +// WHERE id = ( +// SELECT id +// FROM system.namespace +// WHERE name = 'test_upload_files' +// ); +// +// The files table is not broken, but should probably be inserted for completeness. +// +// These are shared with the tests for the upgrade preconditions. +const ( + brokenUserfilePayloadTable = "0ac4040a13746573745f75706c6f61645f7061796c6f6164186b206428033a00422d0a0766696c655f696410011a0d080e10001800300050861760002000300068007000780080010088010098010042300a0b627974655f6f666673657410021a0c08011040180030005014600020003000680070007800800100880100980100422c0a077061796c6f616410031a0c0808100018003000501160002001300068007000780080010088010098010048045290010a18746573745f75706c6f61645f7061796c6f61645f706b657910011801220766696c655f6964220b627974655f6f66667365742a077061796c6f616430013002400040004a10080010001a00200028003000380040005a0070037a0408002000800100880100900104980101a20106080012001800a80100b20100ba0100c00100c80188a5978797a6b68c17d0010160026a210a0b0a0561646d696e100218020a0a0a04726f6f74100218021204726f6f74180272541801200128013800424a0801120a66696c655f69645f666b1a0c0a0012001800300038004003221e086b10011802206a2a0a66696c655f69645f666b3002380040004800700330003a0a08001a0020002a003003800102880103980100b201320a077072696d61727910001a0766696c655f69641a0b627974655f6f66667365741a077061796c6f61642001200220032803b80101c20100da010c080110818088ef93a5db8d0be80100f2010408001200f801008002009202009a020a0888a5978797a6b68c17b20200b80200c00265c80200e00200800300880304" + brokenUserfileFilesTable = "0a91060a11746573745f75706c6f61645f66696c6573186a206428033a00422d0a0866696c656e616d6510011a0c0807100018003000501960002000300068007000780080010088010098010042400a0766696c655f696410021a0d080e100018003000508617600020002a1167656e5f72616e646f6d5f7575696428293000680070007800800100880100980100422e0a0966696c655f73697a6510031a0c08011040180030005014600020003000680070007800800100880100980100422d0a08757365726e616d6510041a0c0807100018003000501960002000300068007000780080010088010098010042440a0b75706c6f61645f74696d6510051a0d080510001800300050da08600020012a116e6f7728293a3a3a54494d455354414d503000680070007800800100880100980100480652a6010a16746573745f75706c6f61645f66696c65735f706b657910011801220866696c656e616d652a0766696c655f69642a0966696c655f73697a652a08757365726e616d652a0b75706c6f61645f74696d65300140004a10080010001a00200028003000380040005a0070027003700470057a0408002000800100880100900104980101a20106080012001800a80100b20100ba0100c00100c80188a5978797a6b68c17d001025a7b0a1d746573745f75706c6f61645f66696c65735f66696c655f69645f6b657910021801220766696c655f69643002380140004a10080010001a00200028003000380040005a007a0408002000800100880100900103980100a20106080012001800a80100b20100ba0100c00100c80188a5978797a6b68c17d0010160036a210a0b0a0561646d696e100218020a0a0a04726f6f74100218021204726f6f741802800101880103980100b2014c0a077072696d61727910001a0866696c656e616d651a0766696c655f69641a0966696c655f73697a651a08757365726e616d651a0b75706c6f61645f74696d65200120022003200420052800b80101c20100e80100f2010408001200f801008002009202009a020a0888a5978797a6b68c17b20200b80200c00265c80200e00200800300880303" +) + +// TestFixUserfileRelatedDescriptorCorruptionUpgrade tests that we correctly repair +// a broken userfile table. +func TestFixUserfileRelatedDescriptorCorruptionUpgrade(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + var ( + v0 = clusterversion.ByKey(clusterversion.FixUserfileRelatedDescriptorCorruption - 1) + v1 = clusterversion.ByKey(clusterversion.FixUserfileRelatedDescriptorCorruption) + ) + + ctx := context.Background() + settings := cluster.MakeTestingClusterSettingsWithVersions(v1, v0, false /* initializeVersion */) + require.NoError(t, clusterversion.Initialize(ctx, v0, &settings.SV)) + + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{ + Settings: settings, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + DisableAutomaticVersionUpgrade: make(chan struct{}), + BinaryVersionOverride: v0, + }, + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + sqlDB := tc.ServerConn(0) + tdb := sqlutils.MakeSQLRunner(sqlDB) + + var parentID, parentSchemaID descpb.ID + tdb.Exec(t, "CREATE TABLE temp_tbl()") + tdb.QueryRow(t, `SELECT "parentID", "parentSchemaID" FROM system.namespace WHERE name = 'temp_tbl'`). + Scan(&parentID, &parentSchemaID) + + decodeTableDescriptorAndInsert(t, ctx, sqlDB, brokenUserfileFilesTable, parentID, parentSchemaID) + decodeTableDescriptorAndInsert(t, ctx, sqlDB, brokenUserfilePayloadTable, parentID, parentSchemaID) + var count int + err := sqlDB.QueryRow(`SELECT count(1) FROM "".crdb_internal.invalid_objects`).Scan(&count) + require.NoError(t, err) + require.Equal(t, 1, count) + + _, err = sqlDB.Exec(`SET CLUSTER SETTING version = $1`, v1.String()) + require.NoError(t, err) + err = sqlDB.QueryRow(`SELECT count(1) FROM "".crdb_internal.invalid_objects`).Scan(&count) + require.NoError(t, err) + require.Equal(t, 0, count) + tdb.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{v1.String()}}) +} diff --git a/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade.go b/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade.go index 65141758bd60..0cdbcd9a6241 100644 --- a/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade.go +++ b/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade.go @@ -16,10 +16,15 @@ import ( "strings" "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descs" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" "github.com/cockroachdb/cockroach/pkg/upgrade" + "github.com/cockroachdb/cockroach/pkg/util/log" ) func preconditionBeforeStartingAnUpgrade( @@ -43,31 +48,40 @@ func preconditionBeforeStartingAnUpgrade( func preconditionNoInvalidDescriptorsBeforeUpgrading( ctx context.Context, cs clusterversion.ClusterVersion, d upgrade.TenantDeps, ) error { - query := `SELECT * FROM crdb_internal.invalid_objects` - rows, err := d.InternalExecutor.QueryIterator( - ctx, "check-if-there-are-any-invalid-descriptors", nil /* txn */, query, - ) - if err != nil { - return err - } - - var hasNext bool var errMsg strings.Builder - for hasNext, err = rows.Next(ctx); hasNext && err == nil; hasNext, err = rows.Next(ctx) { - // There exists invalid objects; Accumulate their information into `errMsg`. - // `crdb_internal.invalid_objects` has five columns: id, database name, schema name, table name, error. - row := rows.Cur() - descName := tree.MakeTableNameWithSchema( - tree.Name(tree.MustBeDString(row[1])), - tree.Name(tree.MustBeDString(row[2])), - tree.Name(tree.MustBeDString(row[3])), - ) - errMsg.WriteString(fmt.Sprintf("invalid descriptor: %v (%v) because %v\n", descName.String(), row[0], row[4])) - } + err := d.InternalExecutorFactory.DescsTxnWithExecutor(ctx, d.DB, d.SessionData, + func(ctx context.Context, txn *kv.Txn, descriptors *descs.Collection, ie sqlutil.InternalExecutor) error { + query := `SELECT * FROM crdb_internal.invalid_objects` + rows, err := ie.QueryIterator( + ctx, "check-if-there-are-any-invalid-descriptors", txn /* txn */, query) + if err != nil { + return err + } + var hasNext bool + for hasNext, err = rows.Next(ctx); hasNext && err == nil; hasNext, err = rows.Next(ctx) { + // There exists invalid objects; Accumulate their information into `errMsg`. + // `crdb_internal.invalid_objects` has five columns: id, database name, schema name, table name, error. + row := rows.Cur() + descName := tree.MakeTableNameWithSchema( + tree.Name(tree.MustBeDString(row[1])), + tree.Name(tree.MustBeDString(row[2])), + tree.Name(tree.MustBeDString(row[3])), + ) + tableID := descpb.ID(tree.MustBeDInt(row[0])) + errString := string(tree.MustBeDString(row[4])) + // TODO(ssd): Remove in 23.1 once we are sure that the migration which fixes this corruption has run. + if veryLikelyKnownUserfileBreakage(ctx, txn, descriptors, tableID, errString) { + log.Infof(ctx, "ignoring invalid descriptor %v (%v) with error %q because it looks like known userfile-related corruption", + descName.String(), tableID, errString) + } else { + errMsg.WriteString(fmt.Sprintf("invalid descriptor: %v (%v) because %v\n", descName.String(), row[0], row[4])) + } + } + return err + }) if err != nil { return err } - if errMsg.Len() == 0 { return nil } diff --git a/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade_external_test.go b/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade_external_test.go index fa20df8e9de8..c24fc22c1e45 100644 --- a/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade_external_test.go +++ b/pkg/upgrade/upgrades/precondition_before_starting_an_upgrade_external_test.go @@ -45,29 +45,41 @@ func TestPreconditionBeforeStartingAnUpgrade(t *testing.T) { ) ctx := context.Background() - settings := cluster.MakeTestingClusterSettingsWithVersions(v1, v0, false /* initializeVersion */) - require.NoError(t, clusterversion.Initialize(ctx, v0, &settings.SV)) - - tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{ - Settings: settings, - Knobs: base.TestingKnobs{ - Server: &server.TestingKnobs{ - DisableAutomaticVersionUpgrade: make(chan struct{}), - BinaryVersionOverride: v0, + type testSetup struct { + parentID descpb.ID + parentSchemaID descpb.ID + sqlDB *gosql.DB + tdb *sqlutils.SQLRunner + cleanup func() + } + setupTestCluster := func() testSetup { + settings := cluster.MakeTestingClusterSettingsWithVersions(v1, v0, false /* initializeVersion */) + require.NoError(t, clusterversion.Initialize(ctx, v0, &settings.SV)) + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{ + Settings: settings, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + DisableAutomaticVersionUpgrade: make(chan struct{}), + BinaryVersionOverride: v0, + }, }, }, - }, - }) - defer tc.Stopper().Stop(ctx) - - sqlDB := tc.ServerConn(0) - tdb := sqlutils.MakeSQLRunner(sqlDB) - - var parentID, parentSchemaID descpb.ID - tdb.Exec(t, "CREATE TABLE temp_tbl()") - tdb.QueryRow(t, `SELECT "parentID", "parentSchemaID" FROM system.namespace WHERE name = 'temp_tbl'`). - Scan(&parentID, &parentSchemaID) + }) + sqlDB := tc.ServerConn(0) + tdb := sqlutils.MakeSQLRunner(sqlDB) + var parentID, parentSchemaID descpb.ID + tdb.Exec(t, "CREATE TABLE temp_tbl()") + tdb.QueryRow(t, `SELECT "parentID", "parentSchemaID" FROM system.namespace WHERE name = 'temp_tbl'`). + Scan(&parentID, &parentSchemaID) + return testSetup{ + parentID: parentID, + parentSchemaID: parentSchemaID, + sqlDB: sqlDB, + tdb: tdb, + cleanup: func() { tc.Stopper().Stop(ctx) }, + } + } // One subtest for each precondition we wish to test. t.Run("upgrade fails if there exists invalid descriptors", func(t *testing.T) { @@ -95,14 +107,15 @@ func TestPreconditionBeforeStartingAnUpgrade(t *testing.T) { 104 so that it does collide with system tables that are allowed IDs below 100. */ - + ts := setupTestCluster() + defer ts.cleanup() const tableDescriptorToInject = "0a85020a01741868203228023a0042260a016910011a0c080110401800300050146000200030006800700078008001008801009801004802524c0a077072696d61727910011801220169300140004a10080010001a00200028003000380040005a007a0408002000800100880100900104980101a20106080012001800a80100b20100ba010060026a1d0a090a0561646d696e10020a080a04726f6f7410021204726f6f741802800101880103980100b201120a077072696d61727910001a016920012800b80101c20100d201080835100018012000e80100f2010408001200f801008002009202009a020a08f084c3bfb1c1ccfe16b20200b80200c0021dc80200e00200f00200" // Decode and insert the table descriptor. - decodeTableDescriptorAndInsert(t, ctx, sqlDB, tableDescriptorToInject, parentID, parentSchemaID) + decodeTableDescriptorAndInsert(t, ctx, ts.sqlDB, tableDescriptorToInject, ts.parentID, ts.parentSchemaID) // Attempt to upgrade the cluster version and expect to see a failure - _, err := sqlDB.Exec(`SET CLUSTER SETTING version = $1`, v1.String()) + _, err := ts.sqlDB.Exec(`SET CLUSTER SETTING version = $1`, v1.String()) require.Error(t, err, "upgrade should be refused because precondition is violated.") require.Equal(t, "pq: verifying precondition for version 22.1-2: "+ "there exists invalid descriptors as listed below; fix these descriptors before attempting to upgrade again:\n"+ @@ -110,9 +123,19 @@ func TestPreconditionBeforeStartingAnUpgrade(t *testing.T) { "invalid descriptor: defaultdb.public.temp_tbl (104) because 'no matching name info found in non-dropped relation \"t\"'", strings.ReplaceAll(err.Error(), "1000022", "22")) // The cluster version should remain at `v0`. - tdb.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{v0.String()}}) + ts.tdb.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{v0.String()}}) }) + t.Run("upgrade correctly identifies broken userfiles", func(t *testing.T) { + ts := setupTestCluster() + defer ts.cleanup() + + decodeTableDescriptorAndInsert(t, ctx, ts.sqlDB, brokenUserfileFilesTable, ts.parentID, ts.parentSchemaID) + decodeTableDescriptorAndInsert(t, ctx, ts.sqlDB, brokenUserfilePayloadTable, ts.parentID, ts.parentSchemaID) + _, err := ts.sqlDB.Exec(`SET CLUSTER SETTING version = $1`, v1.String()) + require.NoError(t, err) + ts.tdb.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{v1.String()}}) + }) // other preconditions to test here, one per `t.Run()`. } diff --git a/pkg/upgrade/upgrades/upgrades.go b/pkg/upgrade/upgrades/upgrades.go index 77440f85c7e9..7b8c10672d9e 100644 --- a/pkg/upgrade/upgrades/upgrades.go +++ b/pkg/upgrade/upgrades/upgrades.go @@ -159,6 +159,11 @@ var upgrades = []upgrade.Upgrade{ NoPrecondition, updateInvalidColumnIDsInSequenceBackReferences, ), + upgrade.NewTenantUpgrade("fix corrupt user-file related table descriptors", + toCV(clusterversion.FixUserfileRelatedDescriptorCorruption), + NoPrecondition, + fixInvalidObjectsThatLookLikeBadUserfileConstraint, + ), } func init() {