Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sql: extend drop region for system db #137929

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1800,7 +1800,7 @@
<tr><td>APPLICATION</td><td>sql.savepoint.started.count</td><td>Number of SQL SAVEPOINT statements started</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.savepoint.started.count.internal</td><td>Number of SQL SAVEPOINT statements started (internal queries)</td><td>SQL Internal Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.schema.invalid_objects</td><td>Gauge of detected invalid objects within the system.descriptor table (measured by querying crdb_internal.invalid_objects)</td><td>Objects</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.schema_changer.object_count</td><td>Couner of the number of objects in the cluster</td><td>Objects</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.schema_changer.object_count</td><td>Counter of the number of objects in the cluster</td><td>Objects</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.select.count</td><td>Number of SQL SELECT statements successfully executed</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.select.count.internal</td><td>Number of SQL SELECT statements successfully executed (internal queries)</td><td>SQL Internal Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.select.started.count</td><td>Number of SQL SELECT statements started</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
78 changes: 78 additions & 0 deletions pkg/ccl/multiregionccl/multiregion_system_table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,3 +554,81 @@ func TestMrSystemDatabaseUpgrade(t *testing.T) {
{"ALTER PARTITION \"us-east3\" OF INDEX system.public.lease@primary CONFIGURE ZONE USING\n\tnum_voters = 3,\n\tvoter_constraints = '[+region=us-east3]',\n\tlease_preferences = '[[+region=us-east3]]'"},
})
}

func TestMrSystemDatabaseDropRegion(t *testing.T) {
defer leaktest.AfterTest(t)()

defer log.Scope(t).Close(t)

ctx := context.Background()

// Enable settings required for configuring a tenant's system database as multi-region.
makeSettings := func() *cluster.Settings {
cs := cluster.MakeTestingClusterSettingsWithVersions(clusterversion.Latest.Version(),
clusterversion.MinSupported.Version(),
false)
instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond)
return cs
}

cluster, _, cleanup := multiregionccltestutils.TestingCreateMultiRegionCluster(t, 3,
base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
ClusterVersionOverride: clusterversion.MinSupported.Version(),
},
},
multiregionccltestutils.WithSettings(makeSettings()))
defer cleanup()
id, err := roachpb.MakeTenantID(11)
require.NoError(t, err)

// Disable license enforcement for this test.
for _, s := range cluster.Servers {
s.ExecutorConfig().(sql.ExecutorConfig).LicenseEnforcer.Disable(ctx)
}

tenantArgs := base.TestTenantArgs{
Settings: makeSettings(),
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
ClusterVersionOverride: clusterversion.MinSupported.Version(),
},
},
TenantID: id,
Locality: cluster.Servers[0].Locality(),
}
appLayer, tenantSQL := serverutils.StartTenant(t, cluster.Servers[0], tenantArgs)
appLayer.ExecutorConfig().(sql.ExecutorConfig).LicenseEnforcer.Disable(ctx)

tDB := sqlutils.MakeSQLRunner(tenantSQL)

tDB.Exec(t, `ALTER DATABASE system SET PRIMARY REGION "us-east1"`)
tDB.Exec(t, `ALTER DATABASE system ADD REGION "us-east2"`)
tDB.Exec(t, `ALTER DATABASE system ADD REGION "us-east3"`)
tDB.Exec(t, `ALTER DATABASE defaultdb SET PRIMARY REGION "us-east1"`)
tDB.Exec(t, `ALTER DATABASE defaultdb ADD REGION "us-east2"`)
tDB.Exec(t, `ALTER DATABASE defaultdb ADD REGION "us-east3"`)

tDB.CheckQueryResults(t, "SELECT create_statement FROM [SHOW CREATE DATABASE system]", [][]string{
{"CREATE DATABASE system PRIMARY REGION \"us-east1\" REGIONS = \"us-east1\", \"us-east2\", \"us-east3\" SURVIVE REGION FAILURE"},
})

tDB.ExpectErr(t, "region is still in use", `ALTER DATABASE system DROP REGION "us-east3"`)
tDB.Exec(t, `ALTER DATABASE defaultdb DROP REGION "us-east3"`)
tDB.Exec(t, `ALTER DATABASE system DROP REGION "us-east3"`)

tDB.CheckQueryResults(t, `SELECT count(*) FROM system.sql_instances WHERE crdb_region != 'us-east1'::system.public.crdb_internal_region AND crdb_region != 'us-east2'::system.public.crdb_internal_region`, [][]string{
{"0"},
})
tDB.CheckQueryResults(t, `SELECT count(*) FROM system.sqlliveness WHERE crdb_region != 'us-east1'::system.public.crdb_internal_region AND crdb_region != 'us-east2'::system.public.crdb_internal_region`, [][]string{
{"0"},
})
tDB.CheckQueryResults(t, `SELECT count(*) FROM system.region_liveness WHERE crdb_region != 'us-east1'::system.public.crdb_internal_region AND crdb_region != 'us-east2'::system.public.crdb_internal_region`, [][]string{
{"0"},
})
tDB.CheckQueryResults(t, `SELECT count(*) FROM system.lease WHERE crdb_region != 'us-east1'::system.public.crdb_internal_region AND crdb_region != 'us-east2'::system.public.crdb_internal_region`, [][]string{
{"0"},
})
}
41 changes: 41 additions & 0 deletions pkg/sql/alter_database.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog/typedesc"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/zone"
"github.com/cockroachdb/cockroach/pkg/sql/decodeusername"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice"
"github.com/cockroachdb/cockroach/pkg/sql/privilege"
"github.com/cockroachdb/cockroach/pkg/sql/regionliveness"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
"github.com/cockroachdb/cockroach/pkg/sql/sqlclustersettings"
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
Expand Down Expand Up @@ -483,6 +486,44 @@ func (p *planner) AlterDatabaseDropRegion(
if err := p.checkCanDropSystemDatabaseRegion(ctx, n.Region); err != nil {
return nil, err
}
if err := p.ExecCfg().InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
tablesToClean := []string{"sqlliveness", "lease", "sql_instances"}
for _, t := range tablesToClean {
livenessQuery := fmt.Sprintf(
`SELECT count(*) > 0 FROM system.%s WHERE crdb_region = '%s' AND
crdb_internal.sql_liveness_is_alive(session_id)`, t, n.Region)
row, err := txn.QueryRowEx(ctx, "check-session-liveness-for-region", txn.KV(),
sessiondata.NodeUserSessionDataOverride, livenessQuery)
if err != nil {
return err
}
// Block dropping n.Region if any associated session is active.
if tree.MustBeDBool(row[0]) {
return errors.WithHintf(
pgerror.Newf(
pgcode.InvalidDatabaseDefinition,
"cannot drop region %q",
n.Region,
),
"You must not have any active sessions that are in this region. "+
"Ensure that there no nodes that still belong to region %q", n.Region,
)
}
}
// For the region_liveness table, we can just safely remove the reference
// (if any) of the dropping region from the table.
if _, err := txn.ExecEx(ctx, "remove-region-liveness-ref", txn.KV(),
sessiondata.NodeUserSessionDataOverride, `DELETE FROM system.region_liveness
WHERE crdb_region = $1`, n.Region); err != nil {
return err
}
return nil
}); err != nil {
return nil, err
}
if err := regionliveness.CleanupSystemTableForRegion(ctx, p.execCfg.Codec, n.Region.String(), p.txn); err != nil {
return nil, err
}
}

// Ensure survivability goal and number of regions after the drop jive.
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/schema_changer_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
var (
metaObjects = metric.Metadata{
Name: "sql.schema_changer.object_count",
Help: "Couner of the number of objects in the cluster",
Help: "Counter of the number of objects in the cluster",
Measurement: "Objects",
Unit: metric.Unit_COUNT,
}
Expand Down
67 changes: 63 additions & 4 deletions pkg/sql/type_change.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ func (t *typeSchemaChanger) exec(ctx context.Context) error {
// exposing things in the right order in OnFailOrCancel. This is because
// OnFailOrCancel doesn't expose any new state in the type descriptor
// (it just cleans up non-public states).
var multiRegionPreDropIsNecessary bool
var isDroppingMultiRegionEnumMember bool
withDatabaseRegionChangeFinalizer := func(
ctx context.Context, txn descs.Txn,
f func(finalizer *databaseRegionChangeFinalizer) error,
Expand Down Expand Up @@ -367,7 +367,7 @@ func (t *typeSchemaChanger) exec(ctx context.Context) error {
for _, member := range typeDesc.EnumMembers {
if t.isTransitioningInCurrentJob(&member) && enumMemberIsRemoving(&member) {
if typeDesc.Kind == descpb.TypeDescriptor_MULTIREGION_ENUM {
multiRegionPreDropIsNecessary = true
isDroppingMultiRegionEnumMember = true
}
toDrop = append(toDrop, member)
}
Expand All @@ -385,7 +385,7 @@ func (t *typeSchemaChanger) exec(ctx context.Context) error {
// transaction to be a writing transaction; it would have a heck of
// a lot of data to refresh. We instead defer the repartitioning until
// after this checking confirms the safety of the change.
if multiRegionPreDropIsNecessary {
if isDroppingMultiRegionEnumMember {
repartitioned, err := prepareRepartitionedRegionalByRowTables(ctx, txn)
if err != nil {
return err
Expand All @@ -403,10 +403,51 @@ func (t *typeSchemaChanger) exec(ctx context.Context) error {
}
return nil
}

var idsToRemove []int
populateIDsToRemove := func(holder context.Context, txn descs.Txn) error {
typeDesc, err := txn.Descriptors().MutableByID(txn.KV()).Type(ctx, t.typeID)
if err != nil {
return err
}
for _, member := range typeDesc.EnumMembers {
if !t.isTransitioningInCurrentJob(&member) ||
!enumMemberIsRemoving(&member) ||
typeDesc.Kind != descpb.TypeDescriptor_MULTIREGION_ENUM {
continue
}
rows, err := txn.QueryBufferedEx(ctx, "select-invalid-instances", txn.KV(),
sessiondata.NodeUserSessionDataOverride, `SELECT id FROM system.sql_instances
WHERE crdb_region = $1`, member.PhysicalRepresentation)
if err != nil {
return err
}
for _, row := range rows {
idsToRemove = append(idsToRemove, int(tree.MustBeDInt(row[0])))
}
}
return nil
}

removeReferences := func(ctx context.Context, txn descs.Txn) error {
for _, id := range idsToRemove {
deleteQuery := fmt.Sprintf(
`DELETE FROM system.sql_instances WHERE id = %d`, id)
if _, err := txn.ExecEx(ctx, "delete-dropped-region-ref", txn.KV(),
sessiondata.NodeUserSessionDataOverride, deleteQuery); err != nil {
return err
}

}
return nil
}
if err := t.execCfg.InternalDB.DescsTxn(ctx, validateDrops); err != nil {
return err
}
if multiRegionPreDropIsNecessary {
if isDroppingMultiRegionEnumMember {
if err := t.execCfg.InternalDB.DescsTxn(ctx, populateIDsToRemove); err != nil {
return err
}
if err := t.execCfg.InternalDB.DescsTxn(ctx, repartitionRegionalByRowTables); err != nil {
return err
}
Expand Down Expand Up @@ -500,6 +541,12 @@ func (t *typeSchemaChanger) exec(ctx context.Context) error {
if err := refreshTypeDescriptorLeases(ctx, leaseMgr, t.execCfg.DB, typeDesc); err != nil {
return err
}

if isDroppingMultiRegionEnumMember && len(idsToRemove) != 0 {
if err := t.execCfg.InternalDB.DescsTxn(ctx, removeReferences); err != nil {
return err
}
}
}

// If the type is being dropped, remove the descriptor here only
Expand Down Expand Up @@ -1009,6 +1056,18 @@ func (t *typeSchemaChanger) canRemoveEnumValueFromTable(
// Check if the above query returned a result. If it did, then the
// enum value is being used by some place.
if len(rows) > 0 {
// If our enum member is being removed, we can skip this check
// because we need to wait until the region is removed from our
// multiregion enum before we can drop the reference entirely.
// We will perform said cleanup later on during the type schema
// change. We have to do this because
// instancestorage.RunInstanceIDReclaimLoop will add prewarmed
// entries in the instances table for each public region.
if member.Direction == descpb.TypeDescriptor_EnumMember_REMOVE {
if desc.GetID() == keys.SQLInstancesTableID {
return nil
}
}
return pgerror.Newf(pgcode.DependentObjectsStillExist,
"could not remove enum value %q as it is being used by %q in row: %s",
member.LogicalRepresentation, desc.GetName(), labeledRowValues(desc.AccessibleColumns(), rows))
Expand Down
Loading