From e38f2bf7e74085aa3f97bab11a43e20f9bf966a0 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 10:38:51 +0300 Subject: [PATCH 1/7] Online DDL: migration state transitions to 'cancelled' after CANCEL command Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../scheduler/onlineddl_scheduler_test.go | 6 +-- go/vt/vttablet/onlineddl/executor.go | 43 +++++++++++++------ go/vt/vttablet/onlineddl/schema.go | 10 +++++ 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go index 5da4525b803..571978caf51 100644 --- a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go +++ b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go @@ -401,7 +401,7 @@ func TestSchemaChange(t *testing.T) { // let's cancel it onlineddl.CheckCancelMigration(t, &vtParams, shards, drop1uuid, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, drop1uuid, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, drop1uuid, schema.OnlineDDLStatusCancelled) }) t.Run("complete t1", func(t *testing.T) { // t1 should be still running! @@ -511,7 +511,7 @@ func TestSchemaChange(t *testing.T) { // let's cancel it onlineddl.CheckCancelMigration(t, &vtParams, shards, drop1uuid, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, drop1uuid, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, drop1uuid, schema.OnlineDDLStatusCancelled) }) t.Run("complete t1", func(t *testing.T) { // t1 should be still running! @@ -537,7 +537,7 @@ func TestSchemaChange(t *testing.T) { // let's cancel it onlineddl.CheckCancelMigration(t, &vtParams, shards, uuid, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusCancelled) }) // now, we submit the exact same migratoin again: same UUID, same migration context. diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index 9fe615ad153..a767f7d221d 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -1420,9 +1420,8 @@ exit $exit_code log.Infof("Will now dry-run gh-ost on: %s:%d", variables.host, variables.port) if err := runGhost(false); err != nil { - // perhaps gh-ost was interrupted midway and didn't have the chance to send a "failes" status - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) - _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, err.Error()) + // perhaps gh-ost was interrupted midway and didn't have the chance to send a "failed" status + _ = e.failMigration(ctx, onlineDDL, err) log.Errorf("Error executing gh-ost dry run: %+v", err) return err @@ -1433,8 +1432,7 @@ exit $exit_code startedMigrations.Add(1) if err := runGhost(true); err != nil { // perhaps gh-ost was interrupted midway and didn't have the chance to send a "failes" status - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) - _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, err.Error()) + _ = e.failMigration(ctx, onlineDDL, err) failedMigrations.Add(1) log.Errorf("Error running gh-ost: %+v", err) return err @@ -1644,8 +1642,7 @@ export MYSQL_PWD log.Infof("Will now dry-run pt-online-schema-change on: %s:%d", variables.host, variables.port) if err := runPTOSC(false); err != nil { // perhaps pt-osc was interrupted midway and didn't have the chance to send a "failes" status - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) - _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, err.Error()) + _ = e.failMigration(ctx, onlineDDL, err) _ = e.updateMigrationTimestamp(ctx, "completed_timestamp", onlineDDL.UUID) log.Errorf("Error executing pt-online-schema-change dry run: %+v", err) return err @@ -1656,8 +1653,7 @@ export MYSQL_PWD startedMigrations.Add(1) if err := runPTOSC(true); err != nil { // perhaps pt-osc was interrupted midway and didn't have the chance to send a "failes" status - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) - _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, err.Error()) + _ = e.failMigration(ctx, onlineDDL, err) _ = e.updateMigrationTimestamp(ctx, "completed_timestamp", onlineDDL.UUID) _ = e.dropPTOSCMigrationTriggers(ctx, onlineDDL) failedMigrations.Add(1) @@ -1738,7 +1734,6 @@ func (e *Executor) terminateMigration(ctx context.Context, onlineDDL *schema.Onl if err := e.terminateVReplMigration(ctx, onlineDDL.UUID); err != nil { return foundRunning, fmt.Errorf("Error terminating migration, vreplication exec error: %+v", err) } - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) case schema.DDLStrategyPTOSC: // see if pt-osc is running (could have been executed by this vttablet or one that crashed in the past) if running, pid, _ := e.isPTOSCMigrationRunning(ctx, onlineDDL.UUID); running { @@ -1794,17 +1789,25 @@ func (e *Executor) CancelMigration(ctx context.Context, uuid string, message str case schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed: log.Infof("CancelMigration: migration %s is in non-cancellable status: %v", uuid, onlineDDL.Status) return emptyResult, nil + } + // From this point on, we're actually cancelling a migration + _ = e.updateMigrationTimestamp(ctx, "cancelled_timestamp", uuid) + defer e.failMigration(ctx, onlineDDL, errors.New(message)) + defer e.triggerNextCheckInterval() + + switch onlineDDL.Status { case schema.OnlineDDLStatusQueued, schema.OnlineDDLStatusReady: log.Infof("CancelMigration: cancelling %s with status: %v", uuid, onlineDDL.Status) if err := e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusCancelled); err != nil { return nil, err } - rowsAffected = 1 + return &sqltypes.Result{RowsAffected: 1}, nil } - defer e.triggerNextCheckInterval() migrationFound, err := e.terminateMigration(ctx, onlineDDL) - defer e.updateMigrationMessage(ctx, onlineDDL.UUID, message) + if err != nil { + return result, err + } if migrationFound { log.Infof("CancelMigration: terminated %s with status: %v", uuid, onlineDDL.Status) @@ -2341,7 +2344,7 @@ func (e *Executor) getCompletedMigrationByContextAndSQL(ctx context.Context, onl // failMigration marks a migration as failed func (e *Executor) failMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, err error) error { - _ = e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed) + _ = e.updateMigrationStatusFailedOrCancelled(ctx, onlineDDL.UUID) if err != nil { _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, err.Error()) } @@ -3715,6 +3718,18 @@ func (e *Executor) updateTabletFailure(ctx context.Context, uuid string) error { return err } +func (e *Executor) updateMigrationStatusFailedOrCancelled(ctx context.Context, uuid string) error { + log.Infof("updateMigrationStatus: transitioning migration: %s into status failed or cancelled", uuid) + query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStatusFailedOrCancelled, + sqltypes.StringBindVariable(uuid), + ) + if err != nil { + return err + } + _, err = e.execQuery(ctx, query) + return err +} + func (e *Executor) updateMigrationStatus(ctx context.Context, uuid string, status schema.OnlineDDLStatus) error { log.Infof("updateMigrationStatus: transitioning migration: %s into status: %s", uuid, string(status)) query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStatus, diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go index 6ae1728e8b1..3262402ffa8 100644 --- a/go/vt/vttablet/onlineddl/schema.go +++ b/go/vt/vttablet/onlineddl/schema.go @@ -79,6 +79,7 @@ const ( alterSchemaMigrationsTableSpecialPlan = "ALTER TABLE _vt.schema_migrations add column special_plan text NOT NULL" alterSchemaMigrationsLastThrottled = "ALTER TABLE _vt.schema_migrations add column last_throttled_timestamp timestamp NULL DEFAULT NULL" alterSchemaMigrationsComponentThrottled = "ALTER TABLE _vt.schema_migrations add column component_throttled tinytext NOT NULL" + alterSchemaMigrationsCancelledTimestamp = "ALTER TABLE _vt.schema_migrations add column cancelled_timestamp timestamp NULL DEFAULT NULL" sqlInsertMigration = `INSERT IGNORE INTO _vt.schema_migrations ( migration_uuid, @@ -122,6 +123,11 @@ const ( WHERE migration_uuid=%a ` + sqlUpdateMigrationStatusFailedOrCancelled = `UPDATE _vt.schema_migrations + SET migration_status=IF(cancelled_timestamp IS NULL, 'failed', 'cancelled') + WHERE + migration_uuid=%a + ` sqlUpdateMigrationProgress = `UPDATE _vt.schema_migrations SET progress=%a WHERE @@ -272,6 +278,7 @@ const ( ready_timestamp=NULL, started_timestamp=NULL, liveness_timestamp=NULL, + cancelled_timestamp=NULL, completed_timestamp=NULL, cleanup_timestamp=NULL WHERE @@ -289,6 +296,7 @@ const ( ready_timestamp=NULL, started_timestamp=NULL, liveness_timestamp=NULL, + cancelled_timestamp=NULL, completed_timestamp=NULL, cleanup_timestamp=NULL WHERE @@ -410,6 +418,7 @@ const ( vitess_liveness_indicator, user_throttle_ratio, last_throttled_timestamp, + cancelled_timestamp, component_throttled, postpone_completion FROM _vt.schema_migrations @@ -627,4 +636,5 @@ var ApplyDDL = []string{ alterSchemaMigrationsTableSpecialPlan, alterSchemaMigrationsLastThrottled, alterSchemaMigrationsComponentThrottled, + alterSchemaMigrationsCancelledTimestamp, } From 1623e63b8af307629552ad6074f26396b9c4eb56 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 11:22:01 +0300 Subject: [PATCH 2/7] allow 'cancelled' state in vrepl suite Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go b/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go index 34bcbdeb119..f0d4b13d2e0 100644 --- a/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go +++ b/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go @@ -319,7 +319,7 @@ func waitForMigration(t *testing.T, uuid string, timeout time.Duration) sqltypes row := readMigration(t, uuid) status = row["migration_status"].ToString() switch status { - case string(schema.OnlineDDLStatusComplete), string(schema.OnlineDDLStatusFailed): + case string(schema.OnlineDDLStatusComplete), string(schema.OnlineDDLStatusFailed), string(schema.OnlineDDLStatusCancelled): // migration is complete, either successful or not return row } From 6bb968b2bd163f54f1403cc7e279b623109a0ce0 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 11:48:18 +0300 Subject: [PATCH 3/7] allow 'cancelled' state in vrepl suite Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go b/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go index f0d4b13d2e0..31341ed8cdd 100644 --- a/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go +++ b/go/test/endtoend/onlineddl/vrepl_suite/onlineddl_vrepl_suite_test.go @@ -250,7 +250,7 @@ func testSingle(t *testing.T, testName string) { if expectedErrorMessage, exists := readTestFile(t, testName, "expect_failure"); exists { // Failure is expected! - assert.Equal(t, string(schema.OnlineDDLStatusFailed), migrationStatus) + assert.Contains(t, []string{string(schema.OnlineDDLStatusFailed), string(schema.OnlineDDLStatusCancelled)}, migrationStatus) require.Contains(t, migrationMessage, expectedErrorMessage, "expected error message (%s) to contain (%s)", migrationMessage, expectedErrorMessage) // no need to proceed to checksum or anything further return From c3b26f203a4b9dbaf056ed9c17197d537ea55be8 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:26:01 +0300 Subject: [PATCH 4/7] expect 'cancelled' state Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../onlineddl/singleton/onlineddl_singleton_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go/test/endtoend/onlineddl/singleton/onlineddl_singleton_test.go b/go/test/endtoend/onlineddl/singleton/onlineddl_singleton_test.go index ed323789bb4..91614843c26 100644 --- a/go/test/endtoend/onlineddl/singleton/onlineddl_singleton_test.go +++ b/go/test/endtoend/onlineddl/singleton/onlineddl_singleton_test.go @@ -197,7 +197,7 @@ func TestSchemaChange(t *testing.T) { onlineddl.CheckMigrationStatus(t, &vtParams, shards, throttledUUID, schema.OnlineDDLStatusRunning) onlineddl.CheckCancelMigration(t, &vtParams, shards, throttledUUID, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, throttledUUID, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, throttledUUID, schema.OnlineDDLStatusCancelled) }) t.Run("successful gh-ost alter, vtctl", func(t *testing.T) { uuid := testOnlineDDLStatement(t, alterTableTrivialStatement, "gh-ost --singleton", "vtctl", "", "hint_col", "", false) @@ -294,9 +294,9 @@ func TestSchemaChange(t *testing.T) { // revert is running _ = testOnlineDDLStatement(t, dropNonexistentTableStatement, "vitess --allow-concurrent --singleton-context", "vtctl", "migrate:ctx", "", "rejected", true) onlineddl.CheckCancelMigration(t, &vtParams, shards, revertUUID, true) - status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, revertUUID, 20*time.Second, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed) + status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, revertUUID, 20*time.Second, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed, schema.OnlineDDLStatusCancelled) fmt.Printf("# Migration status (for debug purposes): <%s>\n", status) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, revertUUID, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, revertUUID, schema.OnlineDDLStatusCancelled) }) t.Run("success concurrent singleton-context with no-context revert", func(t *testing.T) { revertUUID := testRevertMigration(t, uuids[len(uuids)-1], "vtctl", "vitess --allow-concurrent --postpone-completion", "rev:ctx", "", false) @@ -306,9 +306,9 @@ func TestSchemaChange(t *testing.T) { uuids = append(uuids, uuid) onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusComplete) onlineddl.CheckCancelMigration(t, &vtParams, shards, revertUUID, true) - status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, revertUUID, 20*time.Second, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed) + status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, revertUUID, 20*time.Second, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed, schema.OnlineDDLStatusCancelled) fmt.Printf("# Migration status (for debug purposes): <%s>\n", status) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, revertUUID, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, revertUUID, schema.OnlineDDLStatusCancelled) }) } From 7af515f2eb6572a3442d15a40368b76e8f466af9 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:52:06 +0300 Subject: [PATCH 5/7] expect 'cancelled' state Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go index d1f72eb98f1..d549a2abb51 100644 --- a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go +++ b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go @@ -351,7 +351,7 @@ func TestSchemaChange(t *testing.T) { testRows(t) onlineddl.CheckCancelMigration(t, &vtParams, shards, uuid, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusCancelled) }) t.Run("throttled and unthrottled migration", func(t *testing.T) { From 053536160c10df0c2665f3c4cc06a5525b18893d Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:58:25 +0300 Subject: [PATCH 6/7] excessive check for error Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- go/vt/vttablet/onlineddl/executor.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index a767f7d221d..0d940b3767d 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -1805,10 +1805,6 @@ func (e *Executor) CancelMigration(ctx context.Context, uuid string, message str } migrationFound, err := e.terminateMigration(ctx, onlineDDL) - if err != nil { - return result, err - } - if migrationFound { log.Infof("CancelMigration: terminated %s with status: %v", uuid, onlineDDL.Status) rowsAffected = 1 From ad311f5b18f79bcd0a95bfc4f1bdbb9a37719410 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Mon, 1 Aug 2022 13:28:26 +0300 Subject: [PATCH 7/7] expect 'cancelled' state Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- go/test/endtoend/onlineddl/ghost/onlineddl_ghost_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/test/endtoend/onlineddl/ghost/onlineddl_ghost_test.go b/go/test/endtoend/onlineddl/ghost/onlineddl_ghost_test.go index 792e654ea82..a52e4ae9ec4 100644 --- a/go/test/endtoend/onlineddl/ghost/onlineddl_ghost_test.go +++ b/go/test/endtoend/onlineddl/ghost/onlineddl_ghost_test.go @@ -272,7 +272,7 @@ func TestSchemaChange(t *testing.T) { onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusRunning) onlineddl.CheckCancelMigration(t, &vtParams, shards, uuid, true) time.Sleep(2 * time.Second) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusFailed) + onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusCancelled) }) t.Run("failed migration", func(t *testing.T) { uuid := testOnlineDDLStatement(t, alterTableFailedStatement, "gh-ost", "vtgate", "ghost_col", "")