From f265d20533cd879f46ffbc6283375b5a55206308 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 18:52:47 -0400 Subject: [PATCH 1/2] Cherry-pick 04834c4fec33bdec0102174cb26e8f15359cfaa2 with conflicts --- go/vt/vtctl/workflow/server.go | 2133 +++++++++++++++++ .../vttablet/tabletmanager/framework_test.go | 492 ++++ go/vt/vttablet/tabletmanager/vdiff/action.go | 68 +- .../tabletmanager/vdiff/action_test.go | 70 +- .../tabletmanager/vdiff/controller.go | 22 +- go/vt/vttablet/tabletmanager/vdiff/schema.go | 12 + .../tabletmanager/vdiff/table_differ.go | 4 + .../tabletmanager/vdiff/workflow_differ.go | 6 + go/vt/wrangler/workflow.go | 26 +- 9 files changed, 2804 insertions(+), 29 deletions(-) create mode 100644 go/vt/vttablet/tabletmanager/framework_test.go diff --git a/go/vt/vtctl/workflow/server.go b/go/vt/vtctl/workflow/server.go index b26e198c2cd..53768ade993 100644 --- a/go/vt/vtctl/workflow/server.go +++ b/go/vt/vtctl/workflow/server.go @@ -749,3 +749,2136 @@ func (s *Server) getWorkflowCopyStates(ctx context.Context, tablet *topo.TabletI return copyStates, nil } +<<<<<<< HEAD +======= + +// MoveTablesCreate is part of the vtctlservicepb.VtctldServer interface. +// It passes the embedded TabletRequest object to the given keyspace's +// target primary tablets that will be executing the workflow. 
+func (s *Server) MoveTablesCreate(ctx context.Context, req *vtctldatapb.MoveTablesCreateRequest) (res *vtctldatapb.WorkflowStatusResponse, err error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.MoveTablesCreate") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("cells", req.Cells) + span.Annotate("tablet_types", req.TabletTypes) + span.Annotate("on_ddl", req.OnDdl) + + sourceKeyspace := req.SourceKeyspace + targetKeyspace := req.TargetKeyspace + //FIXME validate tableSpecs, allTables, excludeTables + var ( + tables = req.IncludeTables + externalTopo *topo.Server + sourceTopo = s.ts + ) + + // When the source is an external cluster mounted using the Mount command. + if req.ExternalClusterName != "" { + externalTopo, err = s.ts.OpenExternalVitessClusterServer(ctx, req.ExternalClusterName) + if err != nil { + return nil, err + } + sourceTopo = externalTopo + log.Infof("Successfully opened external topo: %+v", externalTopo) + } + + var vschema *vschemapb.Keyspace + var origVSchema *vschemapb.Keyspace // If we need to rollback a failed create + vschema, err = s.ts.GetVSchema(ctx, targetKeyspace) + if err != nil { + return nil, err + } + if vschema == nil { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no vschema found for target keyspace %s", targetKeyspace) + } + ksTables, err := getTablesInKeyspace(ctx, sourceTopo, s.tmc, sourceKeyspace) + if err != nil { + return nil, err + } + if len(tables) > 0 { + err = s.validateSourceTablesExist(ctx, sourceKeyspace, ksTables, tables) + if err != nil { + return nil, err + } + } else { + if req.AllTables { + tables = ksTables + } else { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no tables to move") + } + } + if len(req.ExcludeTables) > 0 { + err = s.validateSourceTablesExist(ctx, sourceKeyspace, ksTables, req.ExcludeTables) + if err != nil { + return nil, err + } + } + var tables2 []string + for _, t 
:= range tables { + if shouldInclude(t, req.ExcludeTables) { + tables2 = append(tables2, t) + } + } + tables = tables2 + if len(tables) == 0 { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no tables to move") + } + log.Infof("Found tables to move: %s", strings.Join(tables, ",")) + + if !vschema.Sharded { + // Save the original in case we need to restore it for a late failure + // in the defer(). + origVSchema = vschema.CloneVT() + if err := s.addTablesToVSchema(ctx, sourceKeyspace, vschema, tables, externalTopo == nil); err != nil { + return nil, err + } + } + + ms := &vtctldatapb.MaterializeSettings{ + Workflow: req.Workflow, + MaterializationIntent: vtctldatapb.MaterializationIntent_MOVETABLES, + SourceKeyspace: sourceKeyspace, + TargetKeyspace: targetKeyspace, + Cell: strings.Join(req.Cells, ","), + TabletTypes: topoproto.MakeStringTypeCSV(req.TabletTypes), + TabletSelectionPreference: req.TabletSelectionPreference, + StopAfterCopy: req.StopAfterCopy, + ExternalCluster: req.ExternalClusterName, + SourceShards: req.SourceShards, + OnDdl: req.OnDdl, + DeferSecondaryKeys: req.DeferSecondaryKeys, + AtomicCopy: req.AtomicCopy, + } + if req.SourceTimeZone != "" { + ms.SourceTimeZone = req.SourceTimeZone + ms.TargetTimeZone = "UTC" + } + createDDLMode := createDDLAsCopy + if req.DropForeignKeys { + createDDLMode = createDDLAsCopyDropForeignKeys + } + + for _, table := range tables { + buf := sqlparser.NewTrackedBuffer(nil) + buf.Myprintf("select * from %v", sqlparser.NewIdentifierCS(table)) + ms.TableSettings = append(ms.TableSettings, &vtctldatapb.TableMaterializeSettings{ + TargetTable: table, + SourceExpression: buf.String(), + CreateDdl: createDDLMode, + }) + } + mz := &materializer{ + ctx: ctx, + ts: s.ts, + sourceTs: sourceTopo, + tmc: s.tmc, + ms: ms, + } + err = mz.prepareMaterializerStreams(req) + if err != nil { + return nil, err + } + + // If we get an error after this point, where the vreplication streams/records + // have been created, 
then we clean up the workflow's artifacts. + defer func() { + if err != nil { + ts, cerr := s.buildTrafficSwitcher(ctx, ms.TargetKeyspace, ms.Workflow) + if cerr != nil { + err = vterrors.Wrapf(err, "failed to cleanup workflow artifacts: %v", cerr) + } + if cerr := s.dropArtifacts(ctx, false, &switcher{s: s, ts: ts}); cerr != nil { + err = vterrors.Wrapf(err, "failed to cleanup workflow artifacts: %v", cerr) + } + if origVSchema == nil { // There's no previous version to restore + return + } + if cerr := s.ts.SaveVSchema(ctx, targetKeyspace, origVSchema); cerr != nil { + err = vterrors.Wrapf(err, "failed to restore original target vschema: %v", cerr) + } + } + }() + + // Now that the streams have been successfully created, let's put the associated + // routing rules in place. + if externalTopo == nil { + if req.NoRoutingRules { + log.Warningf("Found --no-routing-rules flag, not creating routing rules for workflow %s.%s", targetKeyspace, req.Workflow) + } else { + // Save routing rules before vschema. If we save vschema first, and routing + // rules fails to save, we may generate duplicate table errors. + if mz.isPartial { + if err := createDefaultShardRoutingRules(mz.ctx, mz.ms, mz.ts); err != nil { + return nil, err + } + } + + rules, err := topotools.GetRoutingRules(ctx, s.ts) + if err != nil { + return nil, err + } + for _, table := range tables { + toSource := []string{sourceKeyspace + "." + table} + rules[table] = toSource + rules[table+"@replica"] = toSource + rules[table+"@rdonly"] = toSource + rules[targetKeyspace+"."+table] = toSource + rules[targetKeyspace+"."+table+"@replica"] = toSource + rules[targetKeyspace+"."+table+"@rdonly"] = toSource + rules[targetKeyspace+"."+table] = toSource + rules[sourceKeyspace+"."+table+"@replica"] = toSource + rules[sourceKeyspace+"."+table+"@rdonly"] = toSource + } + if err := topotools.SaveRoutingRules(ctx, s.ts, rules); err != nil { + return nil, err + } + } + if vschema != nil { + // We added to the vschema. 
+ if err := s.ts.SaveVSchema(ctx, targetKeyspace, vschema); err != nil { + return nil, err + } + } + + } + if err := s.ts.RebuildSrvVSchema(ctx, nil); err != nil { + return nil, err + } + + if ms.SourceTimeZone != "" { + if err := mz.checkTZConversion(ctx, ms.SourceTimeZone); err != nil { + return nil, err + } + } + + tabletShards, err := s.collectTargetStreams(ctx, mz) + if err != nil { + return nil, err + } + + migrationID, err := getMigrationID(targetKeyspace, tabletShards) + if err != nil { + return nil, err + } + + if mz.ms.ExternalCluster == "" { + exists, tablets, err := s.checkIfPreviousJournalExists(ctx, mz, migrationID) + if err != nil { + return nil, err + } + if exists { + log.Errorf("Found a previous journal entry for %d", migrationID) + msg := fmt.Sprintf("found an entry from a previous run for migration id %d in _vt.resharding_journal on tablets %s, ", + migrationID, strings.Join(tablets, ",")) + msg += fmt.Sprintf("please review and delete it before proceeding and then start the workflow using: MoveTables --workflow %s --target-keyspace %s start", + req.Workflow, req.TargetKeyspace) + return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, msg) + } + } + + if req.AutoStart { + if err := mz.startStreams(ctx); err != nil { + return nil, err + } + } + + return s.WorkflowStatus(ctx, &vtctldatapb.WorkflowStatusRequest{ + Keyspace: targetKeyspace, + Workflow: req.Workflow, + }) +} + +// MoveTablesComplete is part of the vtctlservicepb.VtctldServer interface. +// It cleans up a successful MoveTables workflow and its related artifacts. 
+func (s *Server) MoveTablesComplete(ctx context.Context, req *vtctldatapb.MoveTablesCompleteRequest) (*vtctldatapb.MoveTablesCompleteResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.MoveTablesComplete") + defer span.Finish() + + ts, state, err := s.getWorkflowState(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + + var summary string + if req.DryRun { + summary = fmt.Sprintf("Complete dry run results for workflow %s.%s at %v", req.TargetKeyspace, req.Workflow, time.Now().UTC().Format(time.RFC822)) + } else { + summary = fmt.Sprintf("Successfully completed the %s workflow in the %s keyspace", req.Workflow, req.TargetKeyspace) + } + var dryRunResults *[]string + + if state.WorkflowType == TypeMigrate { + dryRunResults, err = s.finalizeMigrateWorkflow(ctx, req.TargetKeyspace, req.Workflow, strings.Join(ts.tables, ","), + false, req.KeepData, req.KeepRoutingRules, req.DryRun) + if err != nil { + return nil, vterrors.Wrapf(err, "failed to finalize the %s workflow in the %s keyspace", + req.Workflow, req.TargetKeyspace) + } + resp := &vtctldatapb.MoveTablesCompleteResponse{ + Summary: summary, + } + if dryRunResults != nil { + resp.DryRunResults = *dryRunResults + } + return resp, nil + } + + if !state.WritesSwitched || len(state.ReplicaCellsNotSwitched) > 0 || len(state.RdonlyCellsNotSwitched) > 0 { + return nil, ErrWorkflowNotFullySwitched + } + var renameTable TableRemovalType + if req.RenameTables { + renameTable = RenameTable + } else { + renameTable = DropTable + } + if dryRunResults, err = s.dropSources(ctx, ts, renameTable, req.KeepData, req.KeepRoutingRules, false, req.DryRun); err != nil { + return nil, err + } + + resp := &vtctldatapb.MoveTablesCompleteResponse{ + Summary: summary, + } + if dryRunResults != nil { + resp.DryRunResults = *dryRunResults + } + + return resp, nil +} + +// ReshardCreate is part of the vtctlservicepb.VtctldServer interface. 
+func (s *Server) ReshardCreate(ctx context.Context, req *vtctldatapb.ReshardCreateRequest) (*vtctldatapb.WorkflowStatusResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.ReshardCreate") + defer span.Finish() + + span.Annotate("keyspace", req.Keyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("source_shards", req.SourceShards) + span.Annotate("target_shards", req.TargetShards) + span.Annotate("cells", req.Cells) + span.Annotate("tablet_types", req.TabletTypes) + span.Annotate("on_ddl", req.OnDdl) + + keyspace := req.Keyspace + cells := req.Cells + // TODO: validate workflow does not exist. + + if err := s.ts.ValidateSrvKeyspace(ctx, keyspace, strings.Join(cells, ",")); err != nil { + err2 := vterrors.Wrapf(err, "SrvKeyspace for keyspace %s is corrupt for cell(s) %s", keyspace, cells) + log.Errorf("%w", err2) + return nil, err + } + rs, err := s.buildResharder(ctx, keyspace, req.Workflow, req.SourceShards, req.TargetShards, strings.Join(cells, ","), "") + if err != nil { + return nil, vterrors.Wrap(err, "buildResharder") + } + rs.onDDL = req.OnDdl + rs.stopAfterCopy = req.StopAfterCopy + rs.deferSecondaryKeys = req.DeferSecondaryKeys + if !req.SkipSchemaCopy { + if err := rs.copySchema(ctx); err != nil { + return nil, vterrors.Wrap(err, "copySchema") + } + } + if err := rs.createStreams(ctx); err != nil { + return nil, vterrors.Wrap(err, "createStreams") + } + + if req.AutoStart { + if err := rs.startStreams(ctx); err != nil { + return nil, vterrors.Wrap(err, "startStreams") + } + } else { + log.Warningf("Streams will not be started since --auto-start is set to false") + } + return nil, nil +} + +// VDiffCreate is part of the vtctlservicepb.VtctldServer interface. +// It passes on the request to the target primary tablets that are +// participating in the given workflow and VDiff. 
+func (s *Server) VDiffCreate(ctx context.Context, req *vtctldatapb.VDiffCreateRequest) (*vtctldatapb.VDiffCreateResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffCreate") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("uuid", req.Uuid) + span.Annotate("source_cells", req.SourceCells) + span.Annotate("target_cells", req.TargetCells) + span.Annotate("tablet_types", req.TabletTypes) + span.Annotate("tables", req.Tables) + span.Annotate("auto_retry", req.AutoRetry) + + tabletTypesStr := topoproto.MakeStringTypeCSV(req.TabletTypes) + if req.TabletSelectionPreference == tabletmanagerdatapb.TabletSelectionPreference_INORDER { + tabletTypesStr = discovery.InOrderHint + tabletTypesStr + } + + options := &tabletmanagerdatapb.VDiffOptions{ + PickerOptions: &tabletmanagerdatapb.VDiffPickerOptions{ + TabletTypes: tabletTypesStr, + SourceCell: strings.Join(req.SourceCells, ","), + TargetCell: strings.Join(req.TargetCells, ","), + }, + CoreOptions: &tabletmanagerdatapb.VDiffCoreOptions{ + Tables: strings.Join(req.Tables, ","), + AutoRetry: req.AutoRetry, + MaxRows: req.MaxExtraRowsToCompare, + TimeoutSeconds: req.FilteredReplicationWaitTime.Seconds, + MaxExtraRowsToCompare: req.MaxExtraRowsToCompare, + UpdateTableStats: req.UpdateTableStats, + }, + ReportOptions: &tabletmanagerdatapb.VDiffReportOptions{ + OnlyPks: req.OnlyPKs, + DebugQuery: req.DebugQuery, + }, + } + + tabletreq := &tabletmanagerdatapb.VDiffRequest{ + Keyspace: req.TargetKeyspace, + Workflow: req.Workflow, + Action: string(vdiff.CreateAction), + Options: options, + VdiffUuid: req.Uuid, + } + + ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + if ts.frozen { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid VDiff run: writes have been already been switched for workflow %s.%s", + req.TargetKeyspace, req.Workflow) + } + + 
err = ts.ForAllTargets(func(target *MigrationTarget) error { + _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) + return err + }) + if err != nil { + log.Errorf("Error executing vdiff create action: %v", err) + return nil, err + } + + return &vtctldatapb.VDiffCreateResponse{ + UUID: req.Uuid, + }, nil +} + +// VDiffDelete is part of the vtctlservicepb.VtctldServer interface. +func (s *Server) VDiffDelete(ctx context.Context, req *vtctldatapb.VDiffDeleteRequest) (*vtctldatapb.VDiffDeleteResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffDelete") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("argument", req.Arg) + + tabletreq := &tabletmanagerdatapb.VDiffRequest{ + Keyspace: req.TargetKeyspace, + Workflow: req.Workflow, + Action: string(vdiff.DeleteAction), + ActionArg: req.Arg, + } + + ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + + err = ts.ForAllTargets(func(target *MigrationTarget) error { + _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) + return err + }) + if err != nil { + log.Errorf("Error executing vdiff delete action: %v", err) + return nil, err + } + + return &vtctldatapb.VDiffDeleteResponse{}, nil +} + +// VDiffResume is part of the vtctlservicepb.VtctldServer interface. 
+func (s *Server) VDiffResume(ctx context.Context, req *vtctldatapb.VDiffResumeRequest) (*vtctldatapb.VDiffResumeResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffResume") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("uuid", req.Uuid) + + tabletreq := &tabletmanagerdatapb.VDiffRequest{ + Keyspace: req.TargetKeyspace, + Workflow: req.Workflow, + Action: string(vdiff.ResumeAction), + VdiffUuid: req.Uuid, + } + + ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + + err = ts.ForAllTargets(func(target *MigrationTarget) error { + _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) + return err + }) + if err != nil { + log.Errorf("Error executing vdiff resume action: %v", err) + return nil, err + } + + return &vtctldatapb.VDiffResumeResponse{}, nil +} + +// VDiffShow is part of the vtctlservicepb.VtctldServer interface. 
+func (s *Server) VDiffShow(ctx context.Context, req *vtctldatapb.VDiffShowRequest) (*vtctldatapb.VDiffShowResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffShow") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("argument", req.Arg) + + tabletreq := &tabletmanagerdatapb.VDiffRequest{ + Keyspace: req.TargetKeyspace, + Workflow: req.Workflow, + Action: string(vdiff.ShowAction), + ActionArg: req.Arg, + } + + ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + + output := &vdiffOutput{ + responses: make(map[string]*tabletmanagerdatapb.VDiffResponse, len(ts.targets)), + err: nil, + } + output.err = ts.ForAllTargets(func(target *MigrationTarget) error { + resp, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) + output.mu.Lock() + defer output.mu.Unlock() + output.responses[target.GetShard().ShardName()] = resp + return err + }) + if output.err != nil { + log.Errorf("Error executing vdiff show action: %v", output.err) + return nil, output.err + } + + return &vtctldatapb.VDiffShowResponse{ + TabletResponses: output.responses, + }, nil +} + +// VDiffStop is part of the vtctlservicepb.VtctldServer interface. 
+func (s *Server) VDiffStop(ctx context.Context, req *vtctldatapb.VDiffStopRequest) (*vtctldatapb.VDiffStopResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffStop") + defer span.Finish() + + span.Annotate("keyspace", req.TargetKeyspace) + span.Annotate("workflow", req.Workflow) + span.Annotate("uuid", req.Uuid) + + tabletreq := &tabletmanagerdatapb.VDiffRequest{ + Keyspace: req.TargetKeyspace, + Workflow: req.Workflow, + Action: string(vdiff.StopAction), + VdiffUuid: req.Uuid, + } + + ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) + if err != nil { + return nil, err + } + + err = ts.ForAllTargets(func(target *MigrationTarget) error { + _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) + return err + }) + if err != nil { + log.Errorf("Error executing vdiff stop action: %v", err) + return nil, err + } + + return &vtctldatapb.VDiffStopResponse{}, nil +} + +// WorkflowDelete is part of the vtctlservicepb.VtctldServer interface. +// It passes on the request to the target primary tablets that are +// participating in the given workflow. +func (s *Server) WorkflowDelete(ctx context.Context, req *vtctldatapb.WorkflowDeleteRequest) (*vtctldatapb.WorkflowDeleteResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.WorkflowDelete") + defer span.Finish() + + span.Annotate("keyspace", req.Keyspace) + span.Annotate("workflow", req.Workflow) + + // Cleanup related data and artifacts. 
+ if _, err := s.DropTargets(ctx, req.Keyspace, req.Workflow, req.KeepData, req.KeepRoutingRules, false); err != nil { + if topo.IsErrType(err, topo.NoNode) { + return nil, vterrors.Wrapf(err, "%s keyspace does not exist", req.Keyspace) + } + return nil, err + } + + deleteReq := &tabletmanagerdatapb.DeleteVReplicationWorkflowRequest{ + Workflow: req.Workflow, + } + vx := vexec.NewVExec(req.Keyspace, req.Workflow, s.ts, s.tmc) + callback := func(ctx context.Context, tablet *topo.TabletInfo) (*querypb.QueryResult, error) { + res, err := s.tmc.DeleteVReplicationWorkflow(ctx, tablet.Tablet, deleteReq) + if err != nil { + return nil, err + } + // Best effort cleanup and optimization of related data. + s.deleteWorkflowVDiffData(ctx, tablet.Tablet, req.Workflow) + s.optimizeCopyStateTable(tablet.Tablet) + return res.Result, err + } + res, err := vx.CallbackContext(ctx, callback) + if err != nil { + return nil, err + } + + if len(res) == 0 { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "the %s workflow does not exist in the %s keyspace", req.Workflow, req.Keyspace) + } + + response := &vtctldatapb.WorkflowDeleteResponse{} + response.Summary = fmt.Sprintf("Successfully cancelled the %s workflow in the %s keyspace", req.Workflow, req.Keyspace) + details := make([]*vtctldatapb.WorkflowDeleteResponse_TabletInfo, 0, len(res)) + for tinfo, tres := range res { + result := &vtctldatapb.WorkflowDeleteResponse_TabletInfo{ + Tablet: tinfo.Alias, + Deleted: tres.RowsAffected > 0, // Can be more than one with shard merges + } + details = append(details, result) + } + response.Details = details + return response, nil +} + +func (s *Server) WorkflowStatus(ctx context.Context, req *vtctldatapb.WorkflowStatusRequest) (*vtctldatapb.WorkflowStatusResponse, error) { + ts, state, err := s.getWorkflowState(ctx, req.Keyspace, req.Workflow) + if err != nil { + return nil, err + } + copyProgress, err := s.GetCopyProgress(ctx, ts, state) + if err != nil { + return nil, err + } + 
resp := &vtctldatapb.WorkflowStatusResponse{} + if copyProgress != nil { + resp.TableCopyState = make(map[string]*vtctldatapb.WorkflowStatusResponse_TableCopyState, len(*copyProgress)) + // We sort the tables for intuitive and consistent output. + var tables []string + for table := range *copyProgress { + tables = append(tables, table) + } + sort.Strings(tables) + var progress tableCopyProgress + for _, table := range tables { + var rowCountPct, tableSizePct float32 + resp.TableCopyState[table] = &vtctldatapb.WorkflowStatusResponse_TableCopyState{} + progress = *(*copyProgress)[table] + if progress.SourceRowCount > 0 { + rowCountPct = float32(100.0 * float64(progress.TargetRowCount) / float64(progress.SourceRowCount)) + } + if progress.SourceTableSize > 0 { + tableSizePct = float32(100.0 * float64(progress.TargetTableSize) / float64(progress.SourceTableSize)) + } + resp.TableCopyState[table].RowsCopied = progress.TargetRowCount + resp.TableCopyState[table].RowsTotal = progress.SourceRowCount + resp.TableCopyState[table].RowsPercentage = rowCountPct + resp.TableCopyState[table].BytesCopied = progress.TargetTableSize + resp.TableCopyState[table].BytesTotal = progress.SourceTableSize + resp.TableCopyState[table].BytesPercentage = tableSizePct + } + } + + workflow, err := s.GetWorkflow(ctx, req.Keyspace, req.Workflow) + if err != nil { + return nil, err + } + + // The stream key is target keyspace/tablet alias, e.g. 0/test-0000000100. + // We sort the keys for intuitive and consistent output. 
+ streamKeys := make([]string, 0, len(workflow.ShardStreams)) + for streamKey := range workflow.ShardStreams { + streamKeys = append(streamKeys, streamKey) + } + sort.Strings(streamKeys) + resp.ShardStreams = make(map[string]*vtctldatapb.WorkflowStatusResponse_ShardStreams, len(streamKeys)) + for _, streamKey := range streamKeys { + streams := workflow.ShardStreams[streamKey].GetStreams() + keyParts := strings.Split(streamKey, "/") + if len(keyParts) != 2 { + return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unexpected stream key format in: %s ; expect /", + streamKey) + } + // We want to use target keyspace/shard as the map key for the + // response, e.g. customer/-80. + ksShard := fmt.Sprintf("%s/%s", req.Keyspace, keyParts[0]) + resp.ShardStreams[ksShard] = &vtctldatapb.WorkflowStatusResponse_ShardStreams{} + resp.ShardStreams[ksShard].Streams = make([]*vtctldatapb.WorkflowStatusResponse_ShardStreamState, len(streams)) + for i, st := range streams { + info := []string{} + ts := &vtctldatapb.WorkflowStatusResponse_ShardStreamState{} + if st.State == binlogdatapb.VReplicationWorkflowState_Error.String() { + info = append(info, st.Message) + } else if st.Position == "" { + info = append(info, "VStream has not started") + } else { + now := time.Now().Nanosecond() + updateLag := int64(now) - st.TimeUpdated.Seconds + if updateLag > 0*1e9 { + info = append(info, "VStream may not be running") + } + txLag := int64(now) - st.TransactionTimestamp.Seconds + info = append(info, fmt.Sprintf("VStream Lag: %ds", txLag/1e9)) + if st.TransactionTimestamp.Seconds > 0 { // if no events occur after copy phase, TransactionTimeStamp can be 0 + info = append(info, fmt.Sprintf("; Tx time: %s.", time.Unix(st.TransactionTimestamp.Seconds, 0).Format(time.ANSIC))) + } + } + ts.Id = int32(st.Id) + ts.Tablet = st.Tablet + ts.SourceShard = fmt.Sprintf("%s/%s", st.BinlogSource.Keyspace, st.BinlogSource.Shard) + ts.Position = st.Position + ts.Status = st.State + ts.Info = strings.Join(info, "; 
") + resp.ShardStreams[ksShard].Streams[i] = ts + } + } + + return resp, nil +} + +// GetCopyProgress returns the progress of all tables being copied in the +// workflow. +func (s *Server) GetCopyProgress(ctx context.Context, ts *trafficSwitcher, state *State) (*copyProgress, error) { + getTablesQuery := "select distinct table_name from _vt.copy_state cs, _vt.vreplication vr where vr.id = cs.vrepl_id and vr.id = %d" + getRowCountQuery := "select table_name, table_rows, data_length from information_schema.tables where table_schema = %s and table_name in (%s)" + tables := make(map[string]bool) + const MaxRows = 1000 + sourcePrimaries := make(map[*topodatapb.TabletAlias]bool) + for _, target := range ts.targets { + for id, bls := range target.Sources { + query := fmt.Sprintf(getTablesQuery, id) + p3qr, err := s.tmc.ExecuteFetchAsDba(ctx, target.GetPrimary().Tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ + Query: []byte(query), + MaxRows: MaxRows, + }) + if err != nil { + return nil, err + } + if len(p3qr.Rows) < 1 { + continue + } + qr := sqltypes.Proto3ToResult(p3qr) + for i := 0; i < len(p3qr.Rows); i++ { + tables[qr.Rows[i][0].ToString()] = true + } + sourcesi, err := s.ts.GetShard(ctx, bls.Keyspace, bls.Shard) + if err != nil { + return nil, err + } + found := false + for existingSource := range sourcePrimaries { + if existingSource.Uid == sourcesi.PrimaryAlias.Uid { + found = true + } + } + if !found { + sourcePrimaries[sourcesi.PrimaryAlias] = true + } + } + } + if len(tables) == 0 { + return nil, nil + } + var tableList []string + targetRowCounts := make(map[string]int64) + sourceRowCounts := make(map[string]int64) + targetTableSizes := make(map[string]int64) + sourceTableSizes := make(map[string]int64) + + for table := range tables { + tableList = append(tableList, encodeString(table)) + targetRowCounts[table] = 0 + sourceRowCounts[table] = 0 + targetTableSizes[table] = 0 + sourceTableSizes[table] = 0 + } + + var getTableMetrics = func(tablet 
*topodatapb.Tablet, query string, rowCounts *map[string]int64, tableSizes *map[string]int64) error { + p3qr, err := s.tmc.ExecuteFetchAsDba(ctx, tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ + Query: []byte(query), + MaxRows: uint64(len(tables)), + }) + if err != nil { + return err + } + qr := sqltypes.Proto3ToResult(p3qr) + for i := 0; i < len(qr.Rows); i++ { + table := qr.Rows[i][0].ToString() + rowCount, err := qr.Rows[i][1].ToCastInt64() + if err != nil { + return err + } + tableSize, err := qr.Rows[i][2].ToCastInt64() + if err != nil { + return err + } + (*rowCounts)[table] += rowCount + (*tableSizes)[table] += tableSize + } + return nil + } + sourceDbName := "" + for _, tsSource := range ts.sources { + sourceDbName = tsSource.GetPrimary().DbName() + break + } + if sourceDbName == "" { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no sources found for workflow %s.%s", state.TargetKeyspace, state.Workflow) + } + targetDbName := "" + for _, tsTarget := range ts.targets { + targetDbName = tsTarget.GetPrimary().DbName() + break + } + if sourceDbName == "" || targetDbName == "" { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "workflow %s.%s is incorrectly configured", state.TargetKeyspace, state.Workflow) + } + sort.Strings(tableList) // sort list for repeatability for mocking in tests + tablesStr := strings.Join(tableList, ",") + query := fmt.Sprintf(getRowCountQuery, encodeString(targetDbName), tablesStr) + for _, target := range ts.targets { + tablet := target.GetPrimary().Tablet + if err := getTableMetrics(tablet, query, &targetRowCounts, &targetTableSizes); err != nil { + return nil, err + } + } + + query = fmt.Sprintf(getRowCountQuery, encodeString(sourceDbName), tablesStr) + for source := range sourcePrimaries { + ti, err := s.ts.GetTablet(ctx, source) + tablet := ti.Tablet + if err != nil { + return nil, err + } + if err := getTableMetrics(tablet, query, &sourceRowCounts, &sourceTableSizes); err != nil { + 
return nil, err + } + } + + copyProgress := copyProgress{} + for table, rowCount := range targetRowCounts { + copyProgress[table] = &tableCopyProgress{ + TargetRowCount: rowCount, + TargetTableSize: targetTableSizes[table], + SourceRowCount: sourceRowCounts[table], + SourceTableSize: sourceTableSizes[table], + } + } + return ©Progress, nil +} + +// WorkflowUpdate is part of the vtctlservicepb.VtctldServer interface. +// It passes the embedded TabletRequest object to the given keyspace's +// target primary tablets that are participating in the given workflow. +func (s *Server) WorkflowUpdate(ctx context.Context, req *vtctldatapb.WorkflowUpdateRequest) (*vtctldatapb.WorkflowUpdateResponse, error) { + span, ctx := trace.NewSpan(ctx, "workflow.Server.WorkflowUpdate") + defer span.Finish() + + span.Annotate("keyspace", req.Keyspace) + span.Annotate("workflow", req.TabletRequest.Workflow) + span.Annotate("cells", req.TabletRequest.Cells) + span.Annotate("tablet_types", req.TabletRequest.TabletTypes) + span.Annotate("on_ddl", req.TabletRequest.OnDdl) + + vx := vexec.NewVExec(req.Keyspace, req.TabletRequest.Workflow, s.ts, s.tmc) + callback := func(ctx context.Context, tablet *topo.TabletInfo) (*querypb.QueryResult, error) { + res, err := s.tmc.UpdateVReplicationWorkflow(ctx, tablet.Tablet, req.TabletRequest) + if err != nil { + return nil, err + } + return res.Result, err + } + res, err := vx.CallbackContext(ctx, callback) + if err != nil { + if topo.IsErrType(err, topo.NoNode) { + return nil, vterrors.Wrapf(err, "%s keyspace does not exist", req.Keyspace) + } + return nil, err + } + + if len(res) == 0 { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "the %s workflow does not exist in the %s keyspace", req.TabletRequest.Workflow, req.Keyspace) + } + + response := &vtctldatapb.WorkflowUpdateResponse{} + response.Summary = fmt.Sprintf("Successfully updated the %s workflow on (%d) target primary tablets in the %s keyspace", req.TabletRequest.Workflow, 
len(res), req.Keyspace) + details := make([]*vtctldatapb.WorkflowUpdateResponse_TabletInfo, 0, len(res)) + for tinfo, tres := range res { + result := &vtctldatapb.WorkflowUpdateResponse_TabletInfo{ + Tablet: tinfo.Alias, + Changed: tres.RowsAffected > 0, // Can be more than one with shard merges + } + details = append(details, result) + } + response.Details = details + return response, nil +} + +// validateSourceTablesExist validates that tables provided are present +// in the source keyspace. +func (s *Server) validateSourceTablesExist(ctx context.Context, sourceKeyspace string, ksTables, tables []string) error { + var missingTables []string + for _, table := range tables { + if schema.IsInternalOperationTableName(table) { + continue + } + found := false + + for _, ksTable := range ksTables { + if table == ksTable { + found = true + break + } + } + if !found { + missingTables = append(missingTables, table) + } + } + if len(missingTables) > 0 { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "table(s) not found in source keyspace %s: %s", sourceKeyspace, strings.Join(missingTables, ",")) + } + return nil +} + +// addTablesToVSchema adds tables to an (unsharded) vschema if they are not already defined. +// If copyVSchema is true then we copy over the vschema table definitions from the source, +// otherwise we create empty ones. +// For a migrate workflow we do not copy the vschema since the source keyspace is just a +// proxy to import data into Vitess. 
+func (s *Server) addTablesToVSchema(ctx context.Context, sourceKeyspace string, targetVSchema *vschemapb.Keyspace, tables []string, copyVSchema bool) error { + if targetVSchema.Tables == nil { + targetVSchema.Tables = make(map[string]*vschemapb.Table) + } + if copyVSchema { + srcVSchema, err := s.ts.GetVSchema(ctx, sourceKeyspace) + if err != nil { + return vterrors.Wrapf(err, "failed to get vschema for source keyspace %s", sourceKeyspace) + } + for _, table := range tables { + srcTable, sok := srcVSchema.Tables[table] + if _, tok := targetVSchema.Tables[table]; sok && !tok { + targetVSchema.Tables[table] = srcTable + // If going from sharded to unsharded, then we need to remove the + // column vindexes as they are not valid for unsharded tables. + if srcVSchema.Sharded { + targetVSchema.Tables[table].ColumnVindexes = nil + } + } + } + } + // Ensure that each table at least has an empty definition on the target. + for _, table := range tables { + if _, tok := targetVSchema.Tables[table]; !tok { + targetVSchema.Tables[table] = &vschemapb.Table{} + } + } + return nil +} + +func (s *Server) collectTargetStreams(ctx context.Context, mz *materializer) ([]string, error) { + var shardTablets []string + var mu sync.Mutex + err := mz.forAllTargets(func(target *topo.ShardInfo) error { + var qrproto *querypb.QueryResult + var id int64 + var err error + targetPrimary, err := s.ts.GetTablet(ctx, target.PrimaryAlias) + if err != nil { + return vterrors.Wrapf(err, "GetTablet(%v) failed", target.PrimaryAlias) + } + query := fmt.Sprintf("select id from _vt.vreplication where db_name=%s and workflow=%s", encodeString(targetPrimary.DbName()), encodeString(mz.ms.Workflow)) + if qrproto, err = s.tmc.VReplicationExec(ctx, targetPrimary.Tablet, query); err != nil { + return vterrors.Wrapf(err, "VReplicationExec(%v, %s)", targetPrimary.Tablet, query) + } + qr := sqltypes.Proto3ToResult(qrproto) + for i := 0; i < len(qr.Rows); i++ { + id, err = qr.Rows[i][0].ToCastInt64() + if err != nil 
{ + return err + } + mu.Lock() + shardTablets = append(shardTablets, fmt.Sprintf("%s:%d", target.ShardName(), id)) + mu.Unlock() + } + return nil + }) + if err != nil { + return nil, err + } + return shardTablets, nil +} + +func (s *Server) checkIfPreviousJournalExists(ctx context.Context, mz *materializer, migrationID int64) (bool, []string, error) { + forAllSources := func(f func(*topo.ShardInfo) error) error { + var wg sync.WaitGroup + allErrors := &concurrency.AllErrorRecorder{} + for _, sourceShard := range mz.sourceShards { + wg.Add(1) + go func(sourceShard *topo.ShardInfo) { + defer wg.Done() + + if err := f(sourceShard); err != nil { + allErrors.RecordError(err) + } + }(sourceShard) + } + wg.Wait() + return allErrors.AggrError(vterrors.Aggregate) + } + + var ( + mu sync.Mutex + exists bool + tablets []string + ) + + err := forAllSources(func(si *topo.ShardInfo) error { + tablet, err := s.ts.GetTablet(ctx, si.PrimaryAlias) + if err != nil { + return err + } + if tablet == nil { + return nil + } + _, exists, err = s.CheckReshardingJournalExistsOnTablet(ctx, tablet.Tablet, migrationID) + if err != nil { + return err + } + if exists { + mu.Lock() + defer mu.Unlock() + tablets = append(tablets, tablet.AliasString()) + } + return nil + }) + return exists, tablets, err +} + +// deleteWorkflowVDiffData cleans up any potential VDiff related data associated +// with the workflow on the given tablet. 
+func (s *Server) deleteWorkflowVDiffData(ctx context.Context, tablet *topodatapb.Tablet, workflow string) { + if _, err := s.tmc.VDiff(ctx, tablet, &tabletmanagerdatapb.VDiffRequest{ + Keyspace: tablet.Keyspace, + Workflow: workflow, + Action: string(vdiff.DeleteAction), + ActionArg: vdiff.AllActionArg, + }); err != nil { + log.Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) + } +} + +// optimizeCopyStateTable rebuilds the copy_state table to ensure the on-disk +// structures are minimal and optimized and resets the auto-inc value for +// subsequent inserts. +// This helps to ensure that the size, storage, and performance related factors +// for the table remain optimal over time and that we don't ever exhaust the +// available auto-inc values for the table. +// Note: it's not critical that this executes successfully any given time, it's +// only important that we try to do this periodically so that things stay in an +// optimal state over long periods of time. For this reason, the work is done +// asynchronously in the background on the given tablet and any failures are +// logged as warnings. Because it's done in the background we use the AllPrivs +// account to be sure that we don't execute the writes if READ_ONLY is set on +// the MySQL instance. 
+func (s *Server) optimizeCopyStateTable(tablet *topodatapb.Tablet) { + if s.sem != nil { + if !s.sem.TryAcquire(1) { + log.Warningf("Deferring work to optimize the copy_state table on %q due to hitting the maximum concurrent background job limit.", + tablet.Alias.String()) + return + } + } + go func() { + defer func() { + if s.sem != nil { + s.sem.Release(1) + } + }() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + sqlOptimizeTable := "optimize table _vt.copy_state" + if _, err := s.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ + Query: []byte(sqlOptimizeTable), + MaxRows: uint64(100), // always produces 1+rows with notes and status + }); err != nil { + if sqlErr, ok := err.(*sqlerror.SQLError); ok && sqlErr.Num == sqlerror.ERNoSuchTable { // the table may not exist + return + } + log.Warningf("Failed to optimize the copy_state table on %q: %v", tablet.Alias.String(), err) + } + // This will automatically set the value to 1 or the current max value in the + // table, whichever is greater. + sqlResetAutoInc := "alter table _vt.copy_state auto_increment = 1" + if _, err := s.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ + Query: []byte(sqlResetAutoInc), + MaxRows: uint64(0), + }); err != nil { + log.Warningf("Failed to reset the auto_increment value for the copy_state table on %q: %v", + tablet.Alias.String(), err) + } + }() +} + +// DropTargets cleans up target tables, shards and denied tables if a MoveTables/Reshard +// is cancelled. 
+func (s *Server) DropTargets(ctx context.Context, targetKeyspace, workflow string, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { + ts, state, err := s.getWorkflowState(ctx, targetKeyspace, workflow) + if err != nil { + log.Errorf("Failed to get VReplication workflow state for %s.%s: %v", targetKeyspace, workflow, err) + return nil, err + } + + // Return an error if the workflow traffic is partially switched. + if state.WritesSwitched || len(state.ReplicaCellsSwitched) > 0 || len(state.RdonlyCellsSwitched) > 0 { + return nil, ErrWorkflowPartiallySwitched + } + + if state.WorkflowType == TypeMigrate { + _, err := s.finalizeMigrateWorkflow(ctx, targetKeyspace, workflow, "", true, keepData, keepRoutingRules, dryRun) + return nil, err + } + + ts.keepRoutingRules = keepRoutingRules + var sw iswitcher + if dryRun { + sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} + } else { + sw = &switcher{s: s, ts: ts} + } + var tctx context.Context + tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropTargets") + if lockErr != nil { + ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr) + return nil, lockErr + } + defer sourceUnlock(&err) + ctx = tctx + + if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { + tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropTargets") + if lockErr != nil { + ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) + return nil, lockErr + } + defer targetUnlock(&err) + ctx = tctx + } + if !keepData { + switch ts.MigrationType() { + case binlogdatapb.MigrationType_TABLES: + if err := sw.removeTargetTables(ctx); err != nil { + return nil, err + } + if err := sw.dropSourceDeniedTables(ctx); err != nil { + return nil, err + } + if err := sw.dropTargetDeniedTables(ctx); err != nil { + return nil, err + } + case binlogdatapb.MigrationType_SHARDS: + if err := sw.dropTargetShards(ctx); err != nil { + return nil, err + } + } + } + if err := 
s.dropRelatedArtifacts(ctx, keepRoutingRules, sw); err != nil { + return nil, err + } + if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { + return nil, err + } + return sw.logs(), nil +} + +func (s *Server) buildTrafficSwitcher(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, error) { + tgtInfo, err := BuildTargets(ctx, s.ts, s.tmc, targetKeyspace, workflowName) + if err != nil { + log.Infof("Error building targets: %s", err) + return nil, err + } + targets, frozen, optCells, optTabletTypes := tgtInfo.Targets, tgtInfo.Frozen, tgtInfo.OptCells, tgtInfo.OptTabletTypes + + ts := &trafficSwitcher{ + ws: s, + logger: logutil.NewConsoleLogger(), + workflow: workflowName, + reverseWorkflow: ReverseWorkflowName(workflowName), + id: HashStreams(targetKeyspace, targets), + targets: targets, + sources: make(map[string]*MigrationSource), + targetKeyspace: targetKeyspace, + frozen: frozen, + optCells: optCells, + optTabletTypes: optTabletTypes, + workflowType: tgtInfo.WorkflowType, + workflowSubType: tgtInfo.WorkflowSubType, + } + log.Infof("Migration ID for workflow %s: %d", workflowName, ts.id) + sourceTopo := s.ts + + // Build the sources. 
+ for _, target := range targets { + for _, bls := range target.Sources { + if ts.sourceKeyspace == "" { + ts.sourceKeyspace = bls.Keyspace + ts.sourceTimeZone = bls.SourceTimeZone + ts.targetTimeZone = bls.TargetTimeZone + ts.externalCluster = bls.ExternalCluster + if ts.externalCluster != "" { + externalTopo, err := s.ts.OpenExternalVitessClusterServer(ctx, ts.externalCluster) + if err != nil { + return nil, err + } + sourceTopo = externalTopo + ts.externalTopo = externalTopo + } + } else if ts.sourceKeyspace != bls.Keyspace { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "source keyspaces are mismatched across streams: %v vs %v", ts.sourceKeyspace, bls.Keyspace) + } + + if ts.tables == nil { + for _, rule := range bls.Filter.Rules { + ts.tables = append(ts.tables, rule.Match) + } + sort.Strings(ts.tables) + } else { + var tables []string + for _, rule := range bls.Filter.Rules { + tables = append(tables, rule.Match) + } + sort.Strings(tables) + if !reflect.DeepEqual(ts.tables, tables) { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "table lists are mismatched across streams: %v vs %v", ts.tables, tables) + } + } + + if _, ok := ts.sources[bls.Shard]; ok { + continue + } + sourcesi, err := sourceTopo.GetShard(ctx, bls.Keyspace, bls.Shard) + if err != nil { + return nil, err + } + if sourcesi.PrimaryAlias == nil { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "source shard %s/%s currently has no primary tablet", + bls.Keyspace, bls.Shard) + } + sourcePrimary, err := sourceTopo.GetTablet(ctx, sourcesi.PrimaryAlias) + if err != nil { + return nil, err + } + ts.sources[bls.Shard] = NewMigrationSource(sourcesi, sourcePrimary) + } + } + if ts.sourceKeyspace != ts.targetKeyspace || ts.externalCluster != "" { + ts.migrationType = binlogdatapb.MigrationType_TABLES + } else { + // TODO(sougou): for shard migration, validate that source and target combined + // keyranges match. 
+ ts.migrationType = binlogdatapb.MigrationType_SHARDS + for sourceShard := range ts.sources { + if _, ok := ts.targets[sourceShard]; ok { + // If shards are overlapping, then this is a table migration. + ts.migrationType = binlogdatapb.MigrationType_TABLES + break + } + } + } + vs, err := sourceTopo.GetVSchema(ctx, ts.sourceKeyspace) + if err != nil { + return nil, err + } + ts.sourceKSSchema, err = vindexes.BuildKeyspaceSchema(vs, ts.sourceKeyspace) + if err != nil { + return nil, err + } + + sourceShards, targetShards := ts.getSourceAndTargetShardsNames() + + ts.isPartialMigration, err = ts.isPartialMoveTables(sourceShards, targetShards) + if err != nil { + return nil, err + } + if ts.isPartialMigration { + log.Infof("Migration is partial, for shards %+v", sourceShards) + } + return ts, nil +} + +func (s *Server) dropRelatedArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error { + if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil { + return err + } + if !keepRoutingRules { + if err := sw.deleteRoutingRules(ctx); err != nil { + return err + } + if err := sw.deleteShardRoutingRules(ctx); err != nil { + return err + } + } + + return nil +} + +// dropSources cleans up source tables, shards and denied tables after a +// MoveTables/Reshard is completed. 
+func (s *Server) dropSources(ctx context.Context, ts *trafficSwitcher, removalType TableRemovalType, keepData, keepRoutingRules, force, dryRun bool) (*[]string, error) { + var ( + sw iswitcher + err error + ) + if dryRun { + sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} + } else { + sw = &switcher{ts: ts, s: s} + } + var tctx context.Context + tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropSources") + if lockErr != nil { + ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr) + return nil, lockErr + } + defer sourceUnlock(&err) + ctx = tctx + if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { + tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropSources") + if lockErr != nil { + ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) + return nil, lockErr + } + defer targetUnlock(&err) + ctx = tctx + } + if !force { + if err := sw.validateWorkflowHasCompleted(ctx); err != nil { + ts.Logger().Errorf("Workflow has not completed, cannot DropSources: %v", err) + return nil, err + } + } + if !keepData { + switch ts.MigrationType() { + case binlogdatapb.MigrationType_TABLES: + log.Infof("Deleting tables") + if err := sw.removeSourceTables(ctx, removalType); err != nil { + return nil, err + } + if err := sw.dropSourceDeniedTables(ctx); err != nil { + return nil, err + } + if err := sw.dropTargetDeniedTables(ctx); err != nil { + return nil, err + } + + case binlogdatapb.MigrationType_SHARDS: + log.Infof("Removing shards") + if err := sw.dropSourceShards(ctx); err != nil { + return nil, err + } + } + } + if err := s.dropArtifacts(ctx, keepRoutingRules, sw); err != nil { + return nil, err + } + if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { + return nil, err + } + + return sw.logs(), nil +} + +func (s *Server) dropArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error { + if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil { + 
return err + } + if err := sw.dropTargetVReplicationStreams(ctx); err != nil { + return err + } + if !keepRoutingRules { + if err := sw.deleteRoutingRules(ctx); err != nil { + return err + } + if err := sw.deleteShardRoutingRules(ctx); err != nil { + return err + } + } + + return nil +} + +// DeleteShard will do all the necessary changes in the topology server +// to entirely remove a shard. +func (s *Server) DeleteShard(ctx context.Context, keyspace, shard string, recursive, evenIfServing bool) error { + // Read the Shard object. If it's not there, try to clean up + // the topology anyway. + shardInfo, err := s.ts.GetShard(ctx, keyspace, shard) + if err != nil { + if topo.IsErrType(err, topo.NoNode) { + log.Infof("Shard %v/%v doesn't seem to exist, cleaning up any potential leftover", keyspace, shard) + return s.ts.DeleteShard(ctx, keyspace, shard) + } + return err + } + + servingCells, err := s.ts.GetShardServingCells(ctx, shardInfo) + if err != nil { + return err + } + // Check the Serving map for the shard, we don't want to + // remove a serving shard if not absolutely sure. + if !evenIfServing && len(servingCells) > 0 { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "shard %v/%v is still serving, cannot delete it, use the even-if-serving flag if needed", keyspace, shard) + } + + cells, err := s.ts.GetCellInfoNames(ctx) + if err != nil { + return err + } + + // Go through all the cells. + for _, cell := range cells { + var aliases []*topodatapb.TabletAlias + + // Get the ShardReplication object for that cell. Try + // to find all tablets that may belong to our shard. + sri, err := s.ts.GetShardReplication(ctx, cell, keyspace, shard) + switch { + case topo.IsErrType(err, topo.NoNode): + // No ShardReplication object. It means the + // topo is inconsistent. Let's read all the + // tablets for that cell, and if we find any + // in our keyspace / shard, either abort or + // try to delete them. 
+ aliases, err = s.ts.GetTabletAliasesByCell(ctx, cell) + if err != nil { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTabletsByCell(%v) failed: %v", cell, err) + } + case err == nil: + // We found a ShardReplication object. We + // trust it to have all tablet records. + aliases = make([]*topodatapb.TabletAlias, len(sri.Nodes)) + for i, n := range sri.Nodes { + aliases[i] = n.TabletAlias + } + default: + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err) + } + + // Get the corresponding Tablet records. Note + // GetTabletMap ignores ErrNoNode, and it's good for + // our purpose, it means a tablet was deleted but is + // still referenced. + tabletMap, err := s.ts.GetTabletMap(ctx, aliases) + if err != nil { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTabletMap() failed: %v", err) + } + + // Remove the tablets that don't belong to our + // keyspace/shard from the map. + for a, ti := range tabletMap { + if ti.Keyspace != keyspace || ti.Shard != shard { + delete(tabletMap, a) + } + } + + // Now see if we need to DeleteTablet, and if we can, do it. + if len(tabletMap) > 0 { + if !recursive { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "shard %v/%v still has %v tablets in cell %v; use --recursive or remove them manually", keyspace, shard, len(tabletMap), cell) + } + + log.Infof("Deleting all tablets in shard %v/%v cell %v", keyspace, shard, cell) + for tabletAlias, tabletInfo := range tabletMap { + // We don't care about scrapping or updating the replication graph, + // because we're about to delete the entire replication graph. + log.Infof("Deleting tablet %v", tabletAlias) + if err := s.ts.DeleteTablet(ctx, tabletInfo.Alias); err != nil && !topo.IsErrType(err, topo.NoNode) { + // We don't want to continue if a DeleteTablet fails for + // any good reason (other than missing tablet, in which + // case it's just a topology server inconsistency we can + // ignore). 
If we continue and delete the replication + // graph, the tablet record will be orphaned, since + // we'll no longer know it belongs to this shard. + // + // If the problem is temporary, or resolved externally, re-running + // DeleteShard will skip over tablets that were already deleted. + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "can't delete tablet %v: %v", tabletAlias, err) + } + } + } + } + + // Try to remove the replication graph and serving graph in each cell, + // regardless of its existence. + for _, cell := range cells { + if err := s.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && !topo.IsErrType(err, topo.NoNode) { + log.Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err) + } + } + + return s.ts.DeleteShard(ctx, keyspace, shard) +} + +// updateShardRecords updates the shard records based on 'from' or 'to' direction. +func (s *Server) updateShardRecords(ctx context.Context, keyspace string, shards []*topo.ShardInfo, cells []string, + servedType topodatapb.TabletType, isFrom bool, clearSourceShards bool, logger logutil.Logger) (err error) { + return topotools.UpdateShardRecords(ctx, s.ts, s.tmc, keyspace, shards, cells, servedType, isFrom, clearSourceShards, logger) +} + +// refreshPrimaryTablets will just RPC-ping all the primary tablets with RefreshState +func (s *Server) refreshPrimaryTablets(ctx context.Context, shards []*topo.ShardInfo) error { + wg := sync.WaitGroup{} + rec := concurrency.AllErrorRecorder{} + for _, si := range shards { + wg.Add(1) + go func(si *topo.ShardInfo) { + defer wg.Done() + ti, err := s.ts.GetTablet(ctx, si.PrimaryAlias) + if err != nil { + rec.RecordError(err) + return + } + + if err := s.tmc.RefreshState(ctx, ti.Tablet); err != nil { + rec.RecordError(err) + } else { + log.Infof("%v responded", topoproto.TabletAliasString(si.PrimaryAlias)) + } + }(si) + } + wg.Wait() + return rec.Error() +} + +// finalizeMigrateWorkflow deletes the streams for the 
Migrate workflow. +// We only cleanup the target for external sources. +func (s *Server) finalizeMigrateWorkflow(ctx context.Context, targetKeyspace, workflow, tableSpecs string, cancel, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { + ts, err := s.buildTrafficSwitcher(ctx, targetKeyspace, workflow) + if err != nil { + ts.Logger().Errorf("buildTrafficSwitcher failed: %v", err) + return nil, err + } + var sw iswitcher + if dryRun { + sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} + } else { + sw = &switcher{s: s, ts: ts} + } + var tctx context.Context + tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "completeMigrateWorkflow") + if lockErr != nil { + ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) + return nil, lockErr + } + defer targetUnlock(&err) + ctx = tctx + if err := sw.dropTargetVReplicationStreams(ctx); err != nil { + return nil, err + } + if !cancel { + if err := sw.addParticipatingTablesToKeyspace(ctx, targetKeyspace, tableSpecs); err != nil { + return nil, err + } + if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { + return nil, err + } + } + log.Infof("cancel is %t, keepData %t", cancel, keepData) + if cancel && !keepData { + if err := sw.removeTargetTables(ctx); err != nil { + return nil, err + } + } + return sw.logs(), nil +} + +// WorkflowSwitchTraffic switches traffic in the direction passed for specified tablet types. 
+func (s *Server) WorkflowSwitchTraffic(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest) (*vtctldatapb.WorkflowSwitchTrafficResponse, error) { + var ( + dryRunResults []string + rdDryRunResults, wrDryRunResults *[]string + hasReplica, hasRdonly, hasPrimary bool + ) + timeout, set, err := protoutil.DurationFromProto(req.Timeout) + if err != nil { + err = vterrors.Wrapf(err, "unable to parse Timeout into a valid duration") + return nil, err + } + if !set { + timeout = defaultDuration + } + ts, startState, err := s.getWorkflowState(ctx, req.Keyspace, req.Workflow) + if err != nil { + return nil, err + } + + if startState.WorkflowType == TypeMigrate { + return nil, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid action for Migrate workflow: SwitchTraffic") + } + + maxReplicationLagAllowed, set, err := protoutil.DurationFromProto(req.MaxReplicationLagAllowed) + if err != nil { + err = vterrors.Wrapf(err, "unable to parse MaxReplicationLagAllowed into a valid duration") + return nil, err + } + if !set { + maxReplicationLagAllowed = defaultDuration + } + direction := TrafficSwitchDirection(req.Direction) + if direction == DirectionBackward { + ts, startState, err = s.getWorkflowState(ctx, startState.SourceKeyspace, ts.reverseWorkflow) + if err != nil { + return nil, err + } + } + reason, err := s.canSwitch(ctx, ts, startState, direction, int64(maxReplicationLagAllowed.Seconds())) + if err != nil { + return nil, err + } + if reason != "" { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot switch traffic for workflow %s at this time: %s", startState.Workflow, reason) + } + hasReplica, hasRdonly, hasPrimary, err = parseTabletTypes(req.TabletTypes) + if err != nil { + return nil, err + } + if hasReplica || hasRdonly { + if rdDryRunResults, err = s.switchReads(ctx, req, ts, startState, timeout, false, direction); err != nil { + return nil, err + } + log.Infof("Switch Reads done for workflow %s.%s", req.Keyspace, req.Workflow) + 
} + if rdDryRunResults != nil { + dryRunResults = append(dryRunResults, *rdDryRunResults...) + } + if hasPrimary { + if _, wrDryRunResults, err = s.switchWrites(ctx, req, ts, timeout, false); err != nil { + return nil, err + } + log.Infof("Switch Writes done for workflow %s.%s", req.Keyspace, req.Workflow) + } + + if wrDryRunResults != nil { + dryRunResults = append(dryRunResults, *wrDryRunResults...) + } + if req.DryRun && len(dryRunResults) == 0 { + dryRunResults = append(dryRunResults, "No changes required") + } + cmd := "SwitchTraffic" + if direction == DirectionBackward { + cmd = "ReverseTraffic" + } + log.Infof("%s done for workflow %s.%s", cmd, req.Keyspace, req.Workflow) + resp := &vtctldatapb.WorkflowSwitchTrafficResponse{} + if req.DryRun { + resp.Summary = fmt.Sprintf("%s dry run results for workflow %s.%s at %v", cmd, req.Keyspace, req.Workflow, time.Now().UTC().Format(time.RFC822)) + resp.DryRunResults = dryRunResults + } else { + log.Infof("SwitchTraffic done for workflow %s.%s", req.Keyspace, req.Workflow) + resp.Summary = fmt.Sprintf("%s was successful for workflow %s.%s", cmd, req.Keyspace, req.Workflow) + // Reload the state after the SwitchTraffic operation + // and return that as a string. + keyspace := req.Keyspace + workflow := req.Workflow + if direction == DirectionBackward { + keyspace = startState.SourceKeyspace + workflow = ts.reverseWorkflow + } + resp.StartState = startState.String() + log.Infof("Before reloading workflow state after switching traffic: %+v\n", resp.StartState) + _, currentState, err := s.getWorkflowState(ctx, keyspace, workflow) + if err != nil { + resp.CurrentState = fmt.Sprintf("Error reloading workflow state after switching traffic: %v", err) + } else { + resp.CurrentState = currentState.String() + } + } + return resp, nil +} + +// switchReads is a generic way of switching read traffic for a workflow. 
+func (s *Server) switchReads(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest, ts *trafficSwitcher, state *State, timeout time.Duration, cancel bool, direction TrafficSwitchDirection) (*[]string, error) { + roTypesToSwitchStr := topoproto.MakeStringTypeCSV(req.TabletTypes) + var switchReplica, switchRdonly bool + for _, roType := range req.TabletTypes { + switch roType { + case topodatapb.TabletType_REPLICA: + switchReplica = true + case topodatapb.TabletType_RDONLY: + switchRdonly = true + } + } + + // Consistently handle errors by logging and returning them. + handleError := func(message string, err error) (*[]string, error) { + werr := vterrors.Errorf(vtrpcpb.Code_INTERNAL, fmt.Sprintf("%s: %v", message, err)) + ts.Logger().Error(werr) + return nil, werr + } + + log.Infof("Switching reads: %s.%s tablet types: %s, cells: %s, workflow state: %s", ts.targetKeyspace, ts.workflow, roTypesToSwitchStr, ts.optCells, state.String()) + if !switchReplica && !switchRdonly { + return handleError("invalid tablet types", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "tablet types must be REPLICA or RDONLY: %s", roTypesToSwitchStr)) + } + if !ts.isPartialMigration { // shard level traffic switching is all or nothing + if direction == DirectionBackward && switchReplica && len(state.ReplicaCellsSwitched) == 0 { + return handleError("invalid request", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "requesting reversal of read traffic for REPLICAs but REPLICA reads have not been switched")) + } + if direction == DirectionBackward && switchRdonly && len(state.RdonlyCellsSwitched) == 0 { + return handleError("invalid request", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "requesting reversal of SwitchReads for RDONLYs but RDONLY reads have not been switched")) + } + } + var cells = req.Cells + // If no cells were provided in the command then use the value from the workflow. 
+ if len(cells) == 0 && ts.optCells != "" { + cells = strings.Split(strings.TrimSpace(ts.optCells), ",") + } + + // If there are no rdonly tablets in the cells ask to switch rdonly tablets as well so that routing rules + // are updated for rdonly as well. Otherwise vitess will not know that the workflow has completed and will + // incorrectly report that not all reads have been switched. User currently is forced to switch non-existent + // rdonly tablets. + if switchReplica && !switchRdonly { + var err error + rdonlyTabletsExist, err := topotools.DoCellsHaveRdonlyTablets(ctx, s.ts, cells) + if err != nil { + return nil, err + } + if !rdonlyTabletsExist { + req.TabletTypes = append(req.TabletTypes, topodatapb.TabletType_RDONLY) + } + } + + // If journals exist notify user and fail. + journalsExist, _, err := ts.checkJournals(ctx) + if err != nil { + return handleError(fmt.Sprintf("failed to read journal in the %s keyspace", ts.SourceKeyspaceName()), err) + } + if journalsExist { + log.Infof("Found a previous journal entry for %d", ts.id) + } + var sw iswitcher + if req.DryRun { + sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} + } else { + sw = &switcher{ts: ts, s: s} + } + + if err := ts.validate(ctx); err != nil { + return handleError("workflow validation failed", err) + } + + // For reads, locking the source keyspace is sufficient. 
+ ctx, unlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchReads") + if lockErr != nil { + return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.SourceKeyspaceName()), lockErr) + } + defer unlock(&err) + + if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { + if ts.isPartialMigration { + ts.Logger().Infof("Partial migration, skipping switchTableReads as traffic is all or nothing per shard and overridden for reads AND writes in the ShardRoutingRule created when switching writes.") + } else if err := sw.switchTableReads(ctx, cells, req.TabletTypes, direction); err != nil { + return handleError("failed to switch read traffic for the tables", err) + } + return sw.logs(), nil + } + ts.Logger().Infof("About to switchShardReads: %+v, %+s, %+v", cells, roTypesToSwitchStr, direction) + if err := sw.switchShardReads(ctx, cells, req.TabletTypes, direction); err != nil { + return handleError("failed to switch read traffic for the shards", err) + } + + ts.Logger().Infof("switchShardReads Completed: %+v, %+s, %+v", cells, roTypesToSwitchStr, direction) + if err := s.ts.ValidateSrvKeyspace(ctx, ts.targetKeyspace, strings.Join(cells, ",")); err != nil { + err2 := vterrors.Wrapf(err, "after switching shard reads, found SrvKeyspace for %s is corrupt in cell %s", + ts.targetKeyspace, strings.Join(cells, ",")) + return handleError("failed to validate SrvKeyspace record", err2) + } + return sw.logs(), nil +} + +// switchWrites is a generic way of migrating write traffic for a workflow. +func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest, ts *trafficSwitcher, timeout time.Duration, + cancel bool) (journalID int64, dryRunResults *[]string, err error) { + + var sw iswitcher + if req.DryRun { + sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} + } else { + sw = &switcher{ts: ts, s: s} + } + + // Consistently handle errors by logging and returning them. 
+ handleError := func(message string, err error) (int64, *[]string, error) { + werr := vterrors.Errorf(vtrpcpb.Code_INTERNAL, fmt.Sprintf("%s: %v", message, err)) + ts.Logger().Error(werr) + return 0, nil, werr + } + + if ts.frozen { + ts.Logger().Warningf("Writes have already been switched for workflow %s, nothing to do here", ts.WorkflowName()) + return 0, sw.logs(), nil + } + + if err := ts.validate(ctx); err != nil { + return handleError("workflow validation failed", err) + } + + if req.EnableReverseReplication { + if err := areTabletsAvailableToStreamFrom(ctx, req, ts, ts.TargetKeyspaceName(), ts.TargetShards()); err != nil { + return handleError(fmt.Sprintf("no tablets were available to stream from in the %s keyspace", ts.SourceKeyspaceName()), err) + } + } + + // Need to lock both source and target keyspaces. + tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchWrites") + if lockErr != nil { + return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.SourceKeyspaceName()), lockErr) + } + ctx = tctx + defer sourceUnlock(&err) + if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { + tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "SwitchWrites") + if lockErr != nil { + return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.TargetKeyspaceName()), lockErr) + } + ctx = tctx + defer targetUnlock(&err) + } + + // Find out if the target is using any sequence tables for auto_increment + // value generation. If so, then we'll need to ensure that they are + // initialized properly before allowing new writes on the target. + sequenceMetadata := make(map[string]*sequenceMetadata) + // For sharded to sharded migrations the sequence must already be setup. + // For reshards the sequence usage is not changed. 
+ if req.InitializeTargetSequences && ts.workflowType == binlogdatapb.VReplicationWorkflowType_MoveTables && + ts.SourceKeyspaceSchema() != nil && ts.SourceKeyspaceSchema().Keyspace != nil && + !ts.SourceKeyspaceSchema().Keyspace.Sharded { + sequenceMetadata, err = ts.getTargetSequenceMetadata(ctx) + if err != nil { + return handleError(fmt.Sprintf("failed to get the sequence information in the %s keyspace", ts.TargetKeyspaceName()), err) + } + } + + // If no journals exist, sourceWorkflows will be initialized by sm.MigrateStreams. + journalsExist, sourceWorkflows, err := ts.checkJournals(ctx) + if err != nil { + return handleError(fmt.Sprintf("failed to read journal in the %s keyspace", ts.SourceKeyspaceName()), err) + } + if !journalsExist { + ts.Logger().Infof("No previous journals were found. Proceeding normally.") + sm, err := BuildStreamMigrator(ctx, ts, cancel) + if err != nil { + return handleError("failed to migrate the workflow streams", err) + } + if cancel { + sw.cancelMigration(ctx, sm) + return 0, sw.logs(), nil + } + + ts.Logger().Infof("Stopping streams") + sourceWorkflows, err = sw.stopStreams(ctx, sm) + if err != nil { + for key, streams := range sm.Streams() { + for _, stream := range streams { + ts.Logger().Errorf("stream in stopStreams: key %s shard %s stream %+v", key, stream.BinlogSource.Shard, stream.BinlogSource) + } + } + sw.cancelMigration(ctx, sm) + return handleError("failed to stop the workflow streams", err) + } + + ts.Logger().Infof("Stopping source writes") + if err := sw.stopSourceWrites(ctx); err != nil { + sw.cancelMigration(ctx, sm) + return handleError(fmt.Sprintf("failed to stop writes in the %s keyspace", ts.SourceKeyspaceName()), err) + } + + if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { + ts.Logger().Infof("Executing LOCK TABLES on source tables %d times", lockTablesCycles) + // Doing this twice with a pause in-between to catch any writes that may have raced in between + // the tablet's deny list check and 
the first mysqld side table lock. + for cnt := 1; cnt <= lockTablesCycles; cnt++ { + if err := ts.executeLockTablesOnSource(ctx); err != nil { + sw.cancelMigration(ctx, sm) + return handleError(fmt.Sprintf("failed to execute LOCK TABLES (attempt %d of %d) on sources", cnt, lockTablesCycles), err) + } + // No need to UNLOCK the tables as the connection was closed once the locks were acquired + // and thus the locks released. + time.Sleep(lockTablesCycleDelay) + } + } + + ts.Logger().Infof("Waiting for streams to catchup") + if err := sw.waitForCatchup(ctx, timeout); err != nil { + sw.cancelMigration(ctx, sm) + return handleError("failed to sync up replication between the source and target", err) + } + + ts.Logger().Infof("Migrating streams") + if err := sw.migrateStreams(ctx, sm); err != nil { + sw.cancelMigration(ctx, sm) + return handleError("failed to migrate the workflow streams", err) + } + + ts.Logger().Infof("Resetting sequences") + if err := sw.resetSequences(ctx); err != nil { + sw.cancelMigration(ctx, sm) + return handleError("failed to reset the sequences", err) + } + + ts.Logger().Infof("Creating reverse streams") + if err := sw.createReverseVReplication(ctx); err != nil { + sw.cancelMigration(ctx, sm) + return handleError("failed to create the reverse vreplication streams", err) + } + } else { + if cancel { + return handleError("invalid cancel", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "traffic switching has reached the point of no return, cannot cancel")) + } + ts.Logger().Infof("Journals were found. Completing the left over steps.") + // Need to gather positions in case all journals were not created. + if err := ts.gatherPositions(ctx); err != nil { + return handleError("failed to gather replication positions", err) + } + } + + // This is the point of no return. Once a journal is created, + // traffic can be redirected to target shards. 
+ if err := sw.createJournals(ctx, sourceWorkflows); err != nil { + return handleError("failed to create the journal", err) + } + // Initialize any target sequences, if there are any, before allowing new writes. + if req.InitializeTargetSequences && len(sequenceMetadata) > 0 { + // Writes are blocked so we can safely initialize the sequence tables but + // we also want to use a shorter timeout than the parent context. + // We use up at most half of the overall timeout. + initSeqCtx, cancel := context.WithTimeout(ctx, timeout/2) + defer cancel() + if err := sw.initializeTargetSequences(initSeqCtx, sequenceMetadata); err != nil { + return handleError(fmt.Sprintf("failed to initialize the sequences used in the %s keyspace", ts.TargetKeyspaceName()), err) + } + } + if err := sw.allowTargetWrites(ctx); err != nil { + return handleError(fmt.Sprintf("failed to allow writes in the %s keyspace", ts.TargetKeyspaceName()), err) + } + if err := sw.changeRouting(ctx); err != nil { + return handleError("failed to update the routing rules", err) + } + if err := sw.streamMigraterfinalize(ctx, ts, sourceWorkflows); err != nil { + return handleError("failed to finalize the traffic switch", err) + } + if req.EnableReverseReplication { + if err := sw.startReverseVReplication(ctx); err != nil { + return handleError("failed to start the reverse workflow", err) + } + } + + if err := sw.freezeTargetVReplication(ctx); err != nil { + return handleError(fmt.Sprintf("failed to freeze the workflow in the %s keyspace", ts.TargetKeyspaceName()), err) + } + + return ts.id, sw.logs(), nil +} + +func (s *Server) canSwitch(ctx context.Context, ts *trafficSwitcher, state *State, direction TrafficSwitchDirection, maxAllowedReplLagSecs int64) (reason string, err error) { + if direction == DirectionForward && state.WritesSwitched || + direction == DirectionBackward && !state.WritesSwitched { + log.Infof("writes already switched no need to check lag") + return "", nil + } + wf, err := s.GetWorkflow(ctx, 
state.TargetKeyspace, state.Workflow) + if err != nil { + return "", err + } + for _, stream := range wf.ShardStreams { + for _, st := range stream.GetStreams() { + if st.Message == Frozen { + return cannotSwitchFrozen, nil + } + // If no new events have been replicated after the copy phase then it will be 0. + if vreplLag := time.Now().Unix() - st.TimeUpdated.Seconds; vreplLag > maxAllowedReplLagSecs { + return fmt.Sprintf(cannotSwitchHighLag, vreplLag, maxAllowedReplLagSecs), nil + } + switch st.State { + case binlogdatapb.VReplicationWorkflowState_Copying.String(): + return cannotSwitchCopyIncomplete, nil + case binlogdatapb.VReplicationWorkflowState_Error.String(): + return cannotSwitchError, nil + } + } + } + + // Ensure that the tablets on both sides are in good shape as we make this same call in the + // process and an error will cause us to backout. + refreshErrors := strings.Builder{} + var m sync.Mutex + var wg sync.WaitGroup + rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) + defer cancel() + refreshTablets := func(shards []*topo.ShardInfo, stype string) { + defer wg.Done() + for _, si := range shards { + if partial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, s.ts, s.tmc, si, nil, ts.Logger()); err != nil || partial { + m.Lock() + refreshErrors.WriteString(fmt.Sprintf("failed to successfully refresh all tablets in the %s/%s %s shard (%v):\n %v\n", + si.Keyspace(), si.ShardName(), stype, err, partialDetails)) + m.Unlock() + } + } + } + wg.Add(1) + go refreshTablets(ts.SourceShards(), "source") + wg.Add(1) + go refreshTablets(ts.TargetShards(), "target") + wg.Wait() + if refreshErrors.Len() > 0 { + return fmt.Sprintf(cannotSwitchFailedTabletRefresh, refreshErrors.String()), nil + } + return "", nil +} + +// VReplicationExec executes a query remotely using the DBA pool. 
+func (s *Server) VReplicationExec(ctx context.Context, tabletAlias *topodatapb.TabletAlias, query string) (*querypb.QueryResult, error) {
+	// Resolve the alias to a full tablet record, as the tablet manager
+	// client call below takes the tablet itself.
+	ti, err := s.ts.GetTablet(ctx, tabletAlias)
+	if err != nil {
+		return nil, err
+	}
+	return s.tmc.VReplicationExec(ctx, ti.Tablet, query)
+}
+
+// CopySchemaShard copies the schema from a source tablet to the
+// specified shard. The schema is applied directly on the primary of
+// the destination shard, and is propagated to the replicas through
+// binlogs.
+//
+// It returns nil without applying anything when the initial comparison
+// finds no differences. Unless skipVerify is set, the schemas are compared
+// again after the copy and any remaining difference is returned as an error.
+// The final reload of the schema on the shard is bounded by
+// waitReplicasTimeout and a failure there is only logged.
+func (s *Server) CopySchemaShard(ctx context.Context, sourceTabletAlias *topodatapb.TabletAlias, tables, excludeTables []string, includeViews bool, destKeyspace, destShard string, waitReplicasTimeout time.Duration, skipVerify bool) error {
+	destShardInfo, err := s.ts.GetShard(ctx, destKeyspace, destShard)
+	if err != nil {
+		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetShard(%v, %v) failed: %v", destKeyspace, destShard, err)
+	}
+
+	if destShardInfo.PrimaryAlias == nil {
+		return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no primary in shard record %v/%v. Consider running 'vtctl InitShardPrimary' in case of a new shard or reparenting the shard to fix the topology data", destKeyspace, destShard)
+	}
+
+	diffs, err := schematools.CompareSchemas(ctx, s.ts, s.tmc, sourceTabletAlias, destShardInfo.PrimaryAlias, tables, excludeTables, includeViews)
+	if err != nil {
+		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard failed because schemas could not be compared initially: %v", err)
+	}
+	if diffs == nil {
+		// Return early because dest has already the same schema as source.
+		return nil
+	}
+
+	req := &tabletmanagerdatapb.GetSchemaRequest{Tables: tables, ExcludeTables: excludeTables, IncludeViews: includeViews}
+	sourceSd, err := schematools.GetSchema(ctx, s.ts, s.tmc, sourceTabletAlias, req)
+	if err != nil {
+		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetSchema(%v, %v, %v, %v) failed: %v", sourceTabletAlias, tables, excludeTables, includeViews, err)
+	}
+
+	createSQLstmts := tmutils.SchemaDefinitionToSQLStrings(sourceSd)
+
+	destTabletInfo, err := s.ts.GetTablet(ctx, destShardInfo.PrimaryAlias)
+	if err != nil {
+		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTablet(%v) failed: %v", destShardInfo.PrimaryAlias, err)
+	}
+	// Apply the statements one at a time on the destination primary and
+	// stop at the first failure.
+	for _, createSQL := range createSQLstmts {
+		err = s.applySQLShard(ctx, destTabletInfo, createSQL)
+		if err != nil {
+			return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "creating a table failed."+
+				" Most likely some tables already exist on the destination and differ from the source."+
+				" Please remove all to be copied tables from the destination manually and run this command again."+
+				" Full error: %v", err)
+		}
+	}
+
+	// Remember the replication position after all the above were applied.
+	destPrimaryPos, err := s.tmc.PrimaryPosition(ctx, destTabletInfo.Tablet)
+	if err != nil {
+		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard: can't get replication position after schema applied: %v", err)
+	}
+
+	// Although the copy was successful, we have to verify it to catch the case
+	// where the database already existed on the destination, but with different
+	// options e.g. a different character set.
+	// In that case, MySQL would have skipped our CREATE DATABASE IF NOT EXISTS
+	// statement.
+	if !skipVerify {
+		diffs, err = schematools.CompareSchemas(ctx, s.ts, s.tmc, sourceTabletAlias, destShardInfo.PrimaryAlias, tables, excludeTables, includeViews)
+		if err != nil {
+			return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard failed because schemas could not be compared finally: %v", err)
+		}
+		if diffs != nil {
+			return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard was not successful because the schemas between the two tablets %v and %v differ: %v", sourceTabletAlias, destShardInfo.PrimaryAlias, diffs)
+		}
+	}
+
+	// Notify Replicas to reload schema. This is best-effort.
+	reloadCtx, cancel := context.WithTimeout(ctx, waitReplicasTimeout)
+	defer cancel()
+	_, ok := schematools.ReloadShard(reloadCtx, s.ts, s.tmc, logutil.NewMemoryLogger(), destKeyspace, destShard, destPrimaryPos, nil, true)
+	if !ok {
+		log.Error(vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard: failed to reload schema on all replicas"))
+	}
+
+	// Every earlier assignment to err returns when it is non-nil, so err is
+	// nil whenever this line is reached.
+	return err
+}
+
+// applySQLShard applies a given SQL change on a given tablet alias. It allows executing arbitrary
+// SQL statements, but doesn't return any results, so it's only useful for SQL statements
+// that would be run for their effects (e.g., CREATE).
+// It works by applying the SQL statement on the shard's primary tablet with replication turned on.
+// Thus it should be used only for changes that can be applied on a live instance without causing issues;
+// it shouldn't be used for anything that will require a pivot.
+// The SQL statement string is expected to have {{.DatabaseName}} in place of the actual db name.
+func (s *Server) applySQLShard(ctx context.Context, tabletInfo *topo.TabletInfo, change string) error { + filledChange, err := fillStringTemplate(change, map[string]string{"DatabaseName": tabletInfo.DbName()}) + if err != nil { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "fillStringTemplate failed: %v", err) + } + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + // Need to make sure that replication is enabled since we're only applying the statement on primaries + _, err = s.tmc.ApplySchema(ctx, tabletInfo.Tablet, &tmutils.SchemaChange{ + SQL: filledChange, + Force: false, + AllowReplication: true, + SQLMode: vreplication.SQLMode, + }) + return err +} + +// fillStringTemplate returns the string template filled +func fillStringTemplate(tmpl string, vars any) (string, error) { + myTemplate := template.Must(template.New("").Parse(tmpl)) + data := new(bytes.Buffer) + if err := myTemplate.Execute(data, vars); err != nil { + return "", err + } + return data.String(), nil +} +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) diff --git a/go/vt/vttablet/tabletmanager/framework_test.go b/go/vt/vttablet/tabletmanager/framework_test.go new file mode 100644 index 00000000000..4734ab9ee96 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/framework_test.go @@ -0,0 +1,492 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package tabletmanager
+
+import (
+	"context"
+	"fmt"
+	"regexp"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"vitess.io/vitess/go/mysql/fakesqldb"
+	"vitess.io/vitess/go/mysql/replication"
+	"vitess.io/vitess/go/sqltypes"
+	"vitess.io/vitess/go/vt/binlog/binlogplayer"
+	"vitess.io/vitess/go/vt/dbconfigs"
+	"vitess.io/vitess/go/vt/grpcclient"
+	"vitess.io/vitess/go/vt/mysqlctl"
+	"vitess.io/vitess/go/vt/topo"
+	"vitess.io/vitess/go/vt/topo/memorytopo"
+	"vitess.io/vitess/go/vt/vttablet/queryservice"
+	"vitess.io/vitess/go/vt/vttablet/tabletconn"
+	"vitess.io/vitess/go/vt/vttablet/tabletconntest"
+	"vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication"
+	"vitess.io/vitess/go/vt/vttablet/tmclient"
+	"vitess.io/vitess/go/vt/vttablet/tmclienttest"
+
+	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+	querypb "vitess.io/vitess/go/vt/proto/query"
+	tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+)
+
+// The GTID flavor and position reported by the fake mysqld and by
+// fakeTMClient.PrimaryPosition.
+const (
+	gtidFlavor   = "MySQL56"
+	gtidPosition = "16b1039f-22b6-11ed-b765-0a43f95f28a3:1-220"
+)
+
+// init registers a "grpc" tablet dialer that hands out a fake query service
+// which always reports the dialed tablet as serving.
+func init() {
+	tabletconn.RegisterDialer("grpc", func(tablet *topodatapb.Tablet, failFast grpcclient.FailFast) (queryservice.QueryService, error) {
+		return &tabletconntest.FakeQueryService{
+			StreamHealthResponse: &querypb.StreamHealthResponse{
+				Serving: true,
+				Target: &querypb.Target{
+					Keyspace:   tablet.Keyspace,
+					Shard:      tablet.Shard,
+					TabletType: tablet.Type,
+					Cell:       tablet.Alias.Cell,
+				},
+				RealtimeStats: &querypb.RealtimeStats{},
+			},
+		}, nil
+	})
+}
+
+// testEnv bundles the fakes that a tabletmanager test runs against: an
+// in-memory topo server, a fake mysqld and a fake tablet manager client.
+// mu is held by the methods below while they touch these fields.
+type testEnv struct {
+	mu        sync.Mutex
+	ctx       context.Context
+	ts        *topo.Server
+	cells     []string
+	mysqld    *mysqlctl.FakeMysqlDaemon
+	tmc       *fakeTMClient
+	dbName    string
+	protoName string
+}
+
+// newTestEnv builds a testEnv with a single cell ("zone1") and registers a
+// tablet dialer and a tablet manager client factory under the test's name.
+// The ctx argument is only passed to the memory topo server; tenv.ctx is a
+// separate context.Background().
+func newTestEnv(t *testing.T, ctx context.Context, sourceKeyspace string, sourceShards []string) *testEnv {
+	tenv := &testEnv{
+		ctx:       context.Background(),
+		tmc:       newFakeTMClient(),
+		cells:     []string{"zone1"},
+		dbName:    "tmtestdb",
+		protoName: t.Name(),
+	}
+	tenv.mu.Lock()
+	defer tenv.mu.Unlock()
+	tenv.ts = memorytopo.NewServer(ctx, tenv.cells...)
+	tenv.tmc.sourceKeyspace = sourceKeyspace
+	tenv.tmc.sourceShards = sourceShards
+	tenv.tmc.schema = defaultSchema
+
+	// Dialing a tablet resolves to the fakeTabletConn registered for its UID.
+	tabletconn.RegisterDialer(t.Name(), func(tablet *topodatapb.Tablet, failFast grpcclient.FailFast) (queryservice.QueryService, error) {
+		tenv.mu.Lock()
+		defer tenv.mu.Unlock()
+		if qs, ok := tenv.tmc.tablets[int(tablet.Alias.Uid)]; ok {
+			return qs, nil
+		}
+		return nil, fmt.Errorf("tablet %d not found", tablet.Alias.Uid)
+	})
+	tabletconntest.SetProtocol(fmt.Sprintf("go.vt.vttablet.tabletmanager.framework_test_%s", t.Name()), tenv.protoName)
+	tmclient.RegisterTabletManagerClientFactory(t.Name(), func() tmclient.TabletManagerClient {
+		return tenv.tmc
+	})
+	tmclienttest.SetProtocol(fmt.Sprintf("go.vt.vttablet.tabletmanager.framework_test_%s", t.Name()), tenv.protoName)
+
+	tenv.mysqld = mysqlctl.NewFakeMysqlDaemon(fakesqldb.New(t))
+	var err error
+	tenv.mysqld.CurrentPrimaryPosition, err = replication.ParsePosition(gtidFlavor, gtidPosition)
+	require.NoError(t, err)
+
+	return tenv
+}
+
+// close shuts down the topo server and the fake mysqld.
+func (tenv *testEnv) close() {
+	tenv.mu.Lock()
+	defer tenv.mu.Unlock()
+	tenv.ts.Close()
+	tenv.mysqld.Close()
+}
+
+//--------------------------------------
+// Tablets
+
+// addTablet creates a PRIMARY tablet with the given id in the first cell,
+// records it as the serving primary of keyspace/shard in the topo, opens a
+// vreplication test engine backed by a mock DB client for it, and returns
+// the resulting fakeTabletConn. Topo failures panic.
+func (tenv *testEnv) addTablet(t *testing.T, id int, keyspace, shard string) *fakeTabletConn {
+	tenv.mu.Lock()
+	defer tenv.mu.Unlock()
+	tablet := &topodatapb.Tablet{
+		Alias: &topodatapb.TabletAlias{
+			Cell: tenv.cells[0],
+			Uid:  uint32(id),
+		},
+		Keyspace: keyspace,
+		Shard:    shard,
+		Type:     topodatapb.TabletType_PRIMARY,
+		PortMap: map[string]int32{
+			tenv.protoName: int32(id),
+		},
+	}
+	if err := tenv.ts.InitTablet(tenv.ctx, tablet, false /* allowPrimaryOverride */, true /* createShardAndKeyspace */, false /* allowUpdate */); err != nil {
+		panic(err)
+	}
+	if _, err := tenv.ts.UpdateShardFields(tenv.ctx, keyspace, shard, func(si *topo.ShardInfo) error {
+		si.PrimaryAlias = tablet.Alias
+		si.IsPrimaryServing = true
+		return nil
+	}); err != nil {
+		panic(err)
+	}
+	if err := tenv.ts.EnsureVSchema(tenv.ctx, keyspace); err != nil {
+		panic(err)
+	}
+
+	vrdbClient := binlogplayer.NewMockDBClient(t)
+	vrdbClient.Tag = fmt.Sprintf("tablet:%d", id)
+	tenv.tmc.tablets[id] = &fakeTabletConn{
+		tablet:     tablet,
+		vrdbClient: vrdbClient,
+	}
+
+	// Both factory arguments of the engine hand out the same mock client.
+	dbClientFactory := func() binlogplayer.DBClient {
+		return tenv.tmc.tablets[id].vrdbClient
+	}
+	tenv.tmc.tablets[id].vrengine = vreplication.NewTestEngine(tenv.ts, tenv.cells[0], tenv.mysqld, dbClientFactory, dbClientFactory, tenv.dbName, nil)
+	// The expectation is registered before Open is called; presumably this is
+	// the query the engine issues while opening — TODO confirm.
+	tenv.tmc.tablets[id].vrdbClient.ExpectRequest(fmt.Sprintf("select * from _vt.vreplication where db_name='%s'", tenv.dbName), &sqltypes.Result{}, nil)
+	tenv.tmc.tablets[id].vrengine.Open(tenv.ctx)
+	require.True(t, tenv.tmc.tablets[id].vrengine.IsOpen(), "vreplication engine was not open")
+
+	tenv.tmc.tablets[id].tm = &TabletManager{
+		VREngine: tenv.tmc.tablets[id].vrengine,
+		DBConfigs: &dbconfigs.DBConfigs{
+			DBName: tenv.dbName,
+		},
+	}
+
+	return tenv.tmc.tablets[id]
+}
+
+// deleteTablet closes the tablet's mock DB client and vreplication engine
+// and removes the tablet and its replication data from the topo. Errors
+// from the topo calls are ignored.
+func (tenv *testEnv) deleteTablet(tablet *topodatapb.Tablet) {
+	tenv.mu.Lock()
+	defer tenv.mu.Unlock()
+	tenv.tmc.tablets[int(tablet.Alias.Uid)].vrdbClient.Close()
+	tenv.tmc.tablets[int(tablet.Alias.Uid)].vrengine.Close()
+	tenv.ts.DeleteTablet(tenv.ctx, tablet.Alias)
+	// This is not automatically removed from shard replication, which results in log spam.
+	topo.DeleteTabletReplicationData(tenv.ctx, tenv.ts, tablet)
+}
+
+// fakeTabletConn implements the TabletConn and QueryService interfaces.
+// It embeds queryservice.QueryService and also carries the tablet record
+// together with the tablet manager, mock DB client and vreplication engine
+// that addTablet creates for it.
+type fakeTabletConn struct {
+	queryservice.QueryService
+	tablet     *topodatapb.Tablet
+	tm         *TabletManager
+	vrdbClient *binlogplayer.MockDBClient
+	vrengine   *vreplication.Engine
+}
+
+// fakeTabletConn implements the QueryService interface.
+func (ftc *fakeTabletConn) Begin(ctx context.Context, target *querypb.Target, options *querypb.ExecuteOptions) (queryservice.TransactionState, error) {
+	return queryservice.TransactionState{
+		TransactionID: 1,
+	}, nil
+}
+
+// Commit is a QueryService stub that always returns 0 and no error.
+func (ftc *fakeTabletConn) Commit(ctx context.Context, target *querypb.Target, transactionID int64) (int64, error) {
+	return 0, nil
+}
+
+// Rollback is a QueryService stub that always returns 0 and no error.
+func (ftc *fakeTabletConn) Rollback(ctx context.Context, target *querypb.Target, transactionID int64) (int64, error) {
+	return 0, nil
+}
+
+// Prepare is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) Prepare(ctx context.Context, target *querypb.Target, transactionID int64, dtid string) (err error) {
+	return nil
+}
+
+// CommitPrepared is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) CommitPrepared(ctx context.Context, target *querypb.Target, dtid string) (err error) {
+	return nil
+}
+
+// RollbackPrepared is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) RollbackPrepared(ctx context.Context, target *querypb.Target, dtid string, originalID int64) (err error) {
+	return nil
+}
+
+// CreateTransaction is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) CreateTransaction(ctx context.Context, target *querypb.Target, dtid string, participants []*querypb.Target) (err error) {
+	return nil
+}
+
+// StartCommit is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) StartCommit(ctx context.Context, target *querypb.Target, transactionID int64, dtid string) (err error) {
+	return nil
+}
+
+// SetRollback is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) SetRollback(ctx context.Context, target *querypb.Target, dtid string, transactionID int64) (err error) {
+	return nil
+}
+
+// ConcludeTransaction is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) ConcludeTransaction(ctx context.Context, target *querypb.Target, dtid string) (err error) {
+	return nil
+}
+
+// ReadTransaction is a QueryService stub that returns nil metadata and no error.
+func (ftc *fakeTabletConn) ReadTransaction(ctx context.Context, target *querypb.Target, dtid string) (metadata *querypb.TransactionMetadata, err error) {
+	return nil, nil
+}
+
+// Execute is a QueryService stub that returns a nil result and no error.
+func (ftc *fakeTabletConn) Execute(ctx context.Context, target *querypb.Target, sql string, bindVariables map[string]*querypb.BindVariable, transactionID, reservedID int64, options *querypb.ExecuteOptions) (*sqltypes.Result, error) {
+	return nil, nil
+}
+
+// StreamExecute is a QueryService stub that never invokes callback and returns nil.
+func (ftc *fakeTabletConn) StreamExecute(ctx context.Context, target *querypb.Target, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, reservedID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) error {
+	return nil
+}
+
+// BeginExecute is a QueryService stub that reports transaction ID 1 with a
+// nil result and no error.
+func (ftc *fakeTabletConn) BeginExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, reservedID int64, options *querypb.ExecuteOptions) (queryservice.TransactionState, *sqltypes.Result, error) {
+	return queryservice.TransactionState{
+		TransactionID: 1,
+	}, nil, nil
+}
+
+// BeginStreamExecute is a QueryService stub that reports transaction ID 1
+// without invoking callback.
+func (ftc *fakeTabletConn) BeginStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, reservedID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.TransactionState, error) {
+	return queryservice.TransactionState{
+		TransactionID: 1,
+	}, nil
+}
+
+// MessageStream is a QueryService stub that never invokes callback and returns nil.
+func (ftc *fakeTabletConn) MessageStream(ctx context.Context, target *querypb.Target, name string, callback func(*sqltypes.Result) error) error {
+	return nil
+}
+
+// MessageAck is a QueryService stub that always returns a count of 0 and no error.
+func (ftc *fakeTabletConn) MessageAck(ctx context.Context, target *querypb.Target, name string, ids []*querypb.Value) (count int64, err error) {
+	return 0, nil
+}
+
+// VStream is a QueryService stub that never invokes send and returns nil.
+func (ftc *fakeTabletConn) VStream(ctx context.Context, request *binlogdatapb.VStreamRequest, send func([]*binlogdatapb.VEvent) error) error {
+	return nil
+}
+
+// VStreamRows is a QueryService stub that never invokes send and returns nil.
+func (ftc *fakeTabletConn) VStreamRows(ctx context.Context, request *binlogdatapb.VStreamRowsRequest, send func(*binlogdatapb.VStreamRowsResponse) error) error {
+	return nil
+}
+
+// VStreamResults is a QueryService stub that never invokes send and returns nil.
+func (ftc *fakeTabletConn) VStreamResults(ctx context.Context, target *querypb.Target, query string, send func(*binlogdatapb.VStreamResultsResponse) error) error {
+	return nil
+}
+
+// HandlePanic is a QueryService stub that does nothing.
+func (ftc *fakeTabletConn) HandlePanic(err *error) {
+}
+
+// ReserveBeginExecute is a QueryService stub that reports reserved ID 1 with
+// a nil result and no error.
+func (ftc *fakeTabletConn) ReserveBeginExecute(ctx context.Context, target *querypb.Target, preQueries []string, postBeginQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, options *querypb.ExecuteOptions) (queryservice.ReservedTransactionState, *sqltypes.Result, error) {
+	return queryservice.ReservedTransactionState{
+		ReservedID: 1,
+	}, nil, nil
+}
+
+// ReserveBeginStreamExecute is a QueryService stub that reports reserved ID 1
+// without invoking callback.
+func (ftc *fakeTabletConn) ReserveBeginStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, postBeginQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.ReservedTransactionState, error) {
+	return queryservice.ReservedTransactionState{
+		ReservedID: 1,
+	}, nil
+}
+
+// ReserveExecute is a QueryService stub that reports reserved ID 1 with a
+// nil result and no error.
+func (ftc *fakeTabletConn) ReserveExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, options *querypb.ExecuteOptions) (queryservice.ReservedState, *sqltypes.Result, error) {
+	return queryservice.ReservedState{
+		ReservedID: 1,
+	}, nil, nil
+}
+
+// ReserveStreamExecute is a QueryService stub that reports reserved ID 1
+// without invoking callback.
+func (ftc *fakeTabletConn) ReserveStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.ReservedState, error) {
+	return queryservice.ReservedState{
+		ReservedID: 1,
+	}, nil
+}
+
+// Release is a QueryService stub that does nothing and returns nil.
+func (ftc *fakeTabletConn) Release(ctx context.Context, target *querypb.Target, transactionID, reservedID int64) error {
+	return nil
+}
+
+// GetSchema is a QueryService stub that never invokes callback and returns nil.
+func (ftc *fakeTabletConn) GetSchema(ctx context.Context, target *querypb.Target, tableType querypb.SchemaTableType, tableNames []string, callback func(schemaRes *querypb.GetSchemaResponse) error) error {
+	return nil
+}
+
+// fakeTabletConn implements the QueryService interface.
+func (ftc *fakeTabletConn) Close(ctx context.Context) error {
+	return nil
+}
+
+// StreamHealth invokes callback exactly once with a response that reports
+// the tablet as serving for its own keyspace, shard, type and cell, and
+// returns whatever the callback returns.
+func (ftc *fakeTabletConn) StreamHealth(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error {
+	return callback(&querypb.StreamHealthResponse{
+		Serving: true,
+		Target: &querypb.Target{
+			Keyspace:   ftc.tablet.Keyspace,
+			Shard:      ftc.tablet.Shard,
+			TabletType: ftc.tablet.Type,
+			Cell:       ftc.tablet.Alias.Cell,
+		},
+		RealtimeStats: &querypb.RealtimeStats{},
+	})
+}
+
+//----------------------------------------------
+// fakeTMClient
+
+// fakeTMClient is a tablet manager client fake. It embeds
+// tmclient.TabletManagerClient and overrides the methods below; tablets and
+// vreQueries are keyed by tablet UID.
+type fakeTMClient struct {
+	tmclient.TabletManagerClient
+	sourceKeyspace string
+	sourceShards   []string
+	tablets        map[int]*fakeTabletConn
+	schema         *tabletmanagerdatapb.SchemaDefinition
+	vreQueries     map[int]map[string]*querypb.QueryResult
+}
+
+// newFakeTMClient returns a fakeTMClient with empty maps and an empty schema.
+func newFakeTMClient() *fakeTMClient {
+	return &fakeTMClient{
+		tablets:    make(map[int]*fakeTabletConn),
+		vreQueries: make(map[int]map[string]*querypb.QueryResult),
+		schema:     &tabletmanagerdatapb.SchemaDefinition{},
+	}
+}
+
+// GetSchema returns the configured schema regardless of tablet and request.
+func (tmc *fakeTMClient) GetSchema(ctx context.Context, tablet *topodatapb.Tablet, request *tabletmanagerdatapb.GetSchemaRequest) (*tabletmanagerdatapb.SchemaDefinition, error) {
+	return tmc.schema, nil
+}
+
+// SetSchema replaces the schema returned by GetSchema.
+func (tmc *fakeTMClient) SetSchema(schema *tabletmanagerdatapb.SchemaDefinition) {
+	tmc.schema = schema
+}
+
+// ExecuteFetchAsApp answers from the results registered for VReplicationExec.
+func (tmc *fakeTMClient) ExecuteFetchAsApp(ctx context.Context, tablet *topodatapb.Tablet, usePool bool, req *tabletmanagerdatapb.ExecuteFetchAsAppRequest) (*querypb.QueryResult, error) {
+	// Reuse VReplicationExec
+	return tmc.VReplicationExec(ctx, tablet, string(req.Query))
+}
+
+// ExecuteFetchAsDba answers from the results registered for VReplicationExec.
+func (tmc *fakeTMClient) ExecuteFetchAsDba(ctx context.Context, tablet *topodatapb.Tablet, usePool bool, req *tabletmanagerdatapb.ExecuteFetchAsDbaRequest) (*querypb.QueryResult, error) {
+	// Reuse VReplicationExec
+	return tmc.VReplicationExec(ctx, tablet, string(req.Query))
+}
+
+// setVReplicationExecResults allows you to specify VReplicationExec queries
+// and their results. You can specify exact strings or strings prefixed with
+// a '/', in which case they will be treated as a valid regexp.
+func (tmc *fakeTMClient) setVReplicationExecResults(tablet *topodatapb.Tablet, query string, result *sqltypes.Result) {
+	queries, ok := tmc.vreQueries[int(tablet.Alias.Uid)]
+	if !ok {
+		queries = make(map[string]*querypb.QueryResult)
+		tmc.vreQueries[int(tablet.Alias.Uid)] = queries
+	}
+	queries[query] = sqltypes.ResultToProto3(result)
+}
+
+// VReplicationExec returns the result registered for query on the given
+// tablet via setVReplicationExecResults. An exact match on the query string
+// wins; otherwise every registered key that starts with '/' is compiled as a
+// regexp (without the leading '/') and tried against query. Map iteration
+// order is random, so if several patterns match, any one of their results
+// may be returned. An error is returned when nothing matches or when a
+// registered pattern is not a valid regexp.
+func (tmc *fakeTMClient) VReplicationExec(ctx context.Context, tablet *topodatapb.Tablet, query string) (*querypb.QueryResult, error) {
+	if result, ok := tmc.vreQueries[int(tablet.Alias.Uid)][query]; ok {
+		return result, nil
+	}
+	for qry, res := range tmc.vreQueries[int(tablet.Alias.Uid)] {
+		if !strings.HasPrefix(qry, "/") {
+			continue
+		}
+		// The '/' only marks the key as a pattern; it is not part of it.
+		re, err := regexp.Compile(strings.TrimPrefix(qry, "/"))
+		if err != nil {
+			return nil, fmt.Errorf("invalid query pattern %q for tablet %d: %v", qry, tablet.Alias.Uid, err)
+		}
+		// Match the incoming query, not the pattern itself.
+		if re.MatchString(query) {
+			return res, nil
+		}
+	}
+	return nil, fmt.Errorf("query %q not found for tablet %d", query, tablet.Alias.Uid)
+}
+
+// CreateVReplicationWorkflow forwards the request to the TabletManager of
+// the fake tablet registered under the tablet's UID.
+func (tmc *fakeTMClient) CreateVReplicationWorkflow(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.CreateVReplicationWorkflowRequest) (*tabletmanagerdatapb.CreateVReplicationWorkflowResponse, error) {
+	return tmc.tablets[int(tablet.Alias.Uid)].tm.CreateVReplicationWorkflow(ctx, req)
+}
+
+// ReadVReplicationWorkflow returns a canned MoveTables workflow named after
+// req.Workflow with one stream per configured source shard. Every stream
+// shares the same filter rules: one rule per table in defaultSchema, with
+// the given tablet's shard as the rule filter.
+func (tmc *fakeTMClient) ReadVReplicationWorkflow(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.ReadVReplicationWorkflowRequest) (*tabletmanagerdatapb.ReadVReplicationWorkflowResponse, error) {
+	resp := &tabletmanagerdatapb.ReadVReplicationWorkflowResponse{
+		Workflow:        req.Workflow,
+		WorkflowSubType: binlogdatapb.VReplicationWorkflowSubType_None,
+		WorkflowType:    binlogdatapb.VReplicationWorkflowType_MoveTables,
+		TabletTypes:     []topodatapb.TabletType{topodatapb.TabletType_PRIMARY},
+		Streams:         make([]*tabletmanagerdatapb.ReadVReplicationWorkflowResponse_Stream, len(tmc.sourceShards)),
+	}
+	rules := make([]*binlogdatapb.Rule, len(defaultSchema.TableDefinitions))
+	for i, table := range defaultSchema.TableDefinitions {
+		rules[i] = &binlogdatapb.Rule{
+			Match:  table.Name,
+			Filter: tablet.Shard,
+		}
+	}
+	for i, shard := range tmc.sourceShards {
+		resp.Streams[i] = &tabletmanagerdatapb.ReadVReplicationWorkflowResponse_Stream{
+			Id: int32(i + 1),
+			Bls: &binlogdatapb.BinlogSource{
+				Keyspace: tmc.sourceKeyspace,
+				Shard:    shard,
+				Filter: &binlogdatapb.Filter{
+					Rules: rules,
+				},
+			},
+		}
+	}
+
+	return resp, nil
+}
+
+// PrimaryPosition always reports the fixed test position as "flavor/position".
+func (tmc *fakeTMClient) PrimaryPosition(ctx context.Context, tablet *topodatapb.Tablet) (string, error) {
+	return fmt.Sprintf("%s/%s", gtidFlavor, gtidPosition), nil
+}
+
+// VReplicationWaitForPos returns nil immediately without waiting.
+func (tmc *fakeTMClient) VReplicationWaitForPos(ctx context.Context, tablet *topodatapb.Tablet, id int32, pos string) error {
+	return nil
+}
+
+// ExecuteFetchAsAllPrivs ignores the request and reports one affected row.
+func (tmc *fakeTMClient) ExecuteFetchAsAllPrivs(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest) (*querypb.QueryResult, error) {
+	return &querypb.QueryResult{
+		RowsAffected: 1,
+	}, nil
+}
+
+// VDiff returns a canned response with ID 1 that echoes the request's UUID
+// and reports one affected row.
+func (tmc *fakeTMClient) VDiff(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.VDiffRequest) (*tabletmanagerdatapb.VDiffResponse, error) {
+	return &tabletmanagerdatapb.VDiffResponse{
+		Id:        1,
+		VdiffUuid: req.VdiffUuid,
+		Output: &querypb.QueryResult{
+			RowsAffected: 1,
+		},
+	}, nil
+}
diff --git a/go/vt/vttablet/tabletmanager/vdiff/action.go b/go/vt/vttablet/tabletmanager/vdiff/action.go index df8fb8854bf..e38a18a921b 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/action.go +++ b/go/vt/vttablet/tabletmanager/vdiff/action.go @@ -295,20 +295,82 @@ func (vde *Engine) handleStopAction(ctx context.Context, dbClient binlogplayer.D } func (vde *Engine) handleDeleteAction(ctx context.Context, dbClient binlogplayer.DBClient, action VDiffAction, req *tabletmanagerdatapb.VDiffRequest, resp *tabletmanagerdatapb.VDiffResponse) error { - var err error - query := "" + vde.mu.Lock() + defer
vde.mu.Unlock() + var deleteQuery string + cleanupController := func(controller *controller) { + if controller == nil { + return + } + controller.Stop() + delete(vde.controllers, controller.id) + } switch req.ActionArg { case AllActionArg: +<<<<<<< HEAD query = fmt.Sprintf(sqlDeleteVDiffs, encodeString(req.Keyspace), encodeString(req.Workflow)) +======= + // We need to stop any running controllers before we delete + // the vdiff records. + query, err := sqlparser.ParseAndBind(sqlGetVDiffIDsByKeyspaceWorkflow, + sqltypes.StringBindVariable(req.Keyspace), + sqltypes.StringBindVariable(req.Workflow), + ) + if err != nil { + return err + } + res, err := dbClient.ExecuteFetch(query, -1) + if err != nil { + return err + } + for _, row := range res.Named().Rows { + cleanupController(vde.controllers[row.AsInt64("id", -1)]) + } + deleteQuery, err = sqlparser.ParseAndBind(sqlDeleteVDiffs, + sqltypes.StringBindVariable(req.Keyspace), + sqltypes.StringBindVariable(req.Workflow), + ) + if err != nil { + return err + } +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) default: uuid, err := uuid.Parse(req.ActionArg) if err != nil { return fmt.Errorf("action argument %s not supported", req.ActionArg) } +<<<<<<< HEAD query = fmt.Sprintf(sqlDeleteVDiffByUUID, encodeString(uuid.String())) +======= + // We need to be sure that the controller is stopped, if + // it's still running, before we delete the vdiff record. 
+ query, err := sqlparser.ParseAndBind(sqlGetVDiffID, + sqltypes.StringBindVariable(uuid.String()), + ) + if err != nil { + return err + } + res, err := dbClient.ExecuteFetch(query, 1) + if err != nil { + return err + } + row := res.Named().Row() // Must only be one + if row == nil { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no vdiff found for UUID %s on tablet %v", + uuid, vde.thisTablet.Alias) + } + cleanupController(vde.controllers[row.AsInt64("id", -1)]) + deleteQuery, err = sqlparser.ParseAndBind(sqlDeleteVDiffByUUID, + sqltypes.StringBindVariable(uuid.String()), + ) + if err != nil { + return err + } +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) } - if _, err = dbClient.ExecuteFetch(query, 1); err != nil { + // Execute the query which deletes the vdiff record(s). + if _, err := dbClient.ExecuteFetch(deleteQuery, 1); err != nil { return err } diff --git a/go/vt/vttablet/tabletmanager/vdiff/action_test.go b/go/vt/vttablet/tabletmanager/vdiff/action_test.go index 6c3106f5310..9bbfbaa4d68 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/action_test.go +++ b/go/vt/vttablet/tabletmanager/vdiff/action_test.go @@ -42,6 +42,10 @@ func TestPerformVDiffAction(t *testing.T) { keyspace := "ks" workflow := "wf" uuid := uuid.New().String() + type queryAndResult struct { + query string + result *sqltypes.Result // Optional if you need a non-empty result + } tests := []struct { name string vde *Engine @@ -49,7 +53,7 @@ func TestPerformVDiffAction(t *testing.T) { preFunc func() error postFunc func() error want *tabletmanagerdatapb.VDiffResponse - expectQueries []string + expectQueries []queryAndResult wantErr error }{ { @@ -72,9 +76,13 @@ func TestPerformVDiffAction(t *testing.T) { preFunc: func() error { return tstenv.TopoServ.CreateCellInfo(ctx, "zone100_test", &topodatapb.CellInfo{}) }, - expectQueries: []string{ - fmt.Sprintf("select id as id from _vt.vdiff where vdiff_uuid = %s", encodeString(uuid)), - 
fmt.Sprintf(`insert into _vt.vdiff(keyspace, workflow, state, options, shard, db_name, vdiff_uuid) values('', '', 'pending', '{\"picker_options\":{\"source_cell\":\"cell1,zone100_test\",\"target_cell\":\"cell1,zone100_test\"}}', '0', 'vt_vttest', %s)`, encodeString(uuid)), + expectQueries: []queryAndResult{ + { + query: fmt.Sprintf("select id as id from _vt.vdiff where vdiff_uuid = %s", encodeString(uuid)), + }, + { + query: fmt.Sprintf(`insert into _vt.vdiff(keyspace, workflow, state, options, shard, db_name, vdiff_uuid) values('', '', 'pending', '{\"picker_options\":{\"source_cell\":\"cell1,zone100_test\",\"target_cell\":\"cell1,zone100_test\"}}', '0', 'vt_vttest', %s)`, encodeString(uuid)), + }, }, postFunc: func() error { return tstenv.TopoServ.DeleteCellInfo(ctx, "zone100_test", true) @@ -102,9 +110,13 @@ func TestPerformVDiffAction(t *testing.T) { Cells: cells, }) }, - expectQueries: []string{ - fmt.Sprintf("select id as id from _vt.vdiff where vdiff_uuid = %s", encodeString(uuid)), - fmt.Sprintf(`insert into _vt.vdiff(keyspace, workflow, state, options, shard, db_name, vdiff_uuid) values('', '', 'pending', '{\"picker_options\":{\"source_cell\":\"all\",\"target_cell\":\"all\"}}', '0', 'vt_vttest', %s)`, encodeString(uuid)), + expectQueries: []queryAndResult{ + { + query: fmt.Sprintf("select id as id from _vt.vdiff where vdiff_uuid = %s", encodeString(uuid)), + }, + { + query: fmt.Sprintf(`insert into _vt.vdiff(keyspace, workflow, state, options, shard, db_name, vdiff_uuid) values('', '', 'pending', '{\"picker_options\":{\"source_cell\":\"all\",\"target_cell\":\"all\"}}', '0', 'vt_vttest', %s)`, encodeString(uuid)), + }, }, postFunc: func() error { if err := tstenv.TopoServ.DeleteCellInfo(ctx, "zone100_test", true); err != nil { @@ -119,9 +131,21 @@ func TestPerformVDiffAction(t *testing.T) { Action: string(DeleteAction), ActionArg: uuid, }, - expectQueries: []string{ - fmt.Sprintf(`delete from vd, vdt using _vt.vdiff as vd left join _vt.vdiff_table as vdt on 
(vd.id = vdt.vdiff_id) + expectQueries: []queryAndResult{ + { + query: fmt.Sprintf("select id as id from _vt.vdiff where vdiff_uuid = %s", encodeString(uuid)), + result: sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + "id", + "int64", + ), + "1", + ), + }, + { + query: fmt.Sprintf(`delete from vd, vdt using _vt.vdiff as vd left join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) where vd.vdiff_uuid = %s`, encodeString(uuid)), + }, }, }, { @@ -132,10 +156,23 @@ func TestPerformVDiffAction(t *testing.T) { Keyspace: keyspace, Workflow: workflow, }, - expectQueries: []string{ - fmt.Sprintf(`delete from vd, vdt, vdl using _vt.vdiff as vd left join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) + expectQueries: []queryAndResult{ + { + query: fmt.Sprintf("select id as id from _vt.vdiff where keyspace = %s and workflow = %s", encodeString(keyspace), encodeString(workflow)), + result: sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + "id", + "int64", + ), + "1", + "2", + ), + }, + { + query: fmt.Sprintf(`delete from vd, vdt, vdl using _vt.vdiff as vd left join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) left join _vt.vdiff_log as vdl on (vd.id = vdl.vdiff_id) where vd.keyspace = %s and vd.workflow = %s`, encodeString(keyspace), encodeString(workflow)), + }, }, }, } @@ -148,10 +185,14 @@ func TestPerformVDiffAction(t *testing.T) { if tt.vde == nil { tt.vde = vdiffenv.vde } - for _, query := range tt.expectQueries { - vdiffenv.dbClient.ExpectRequest(query, &sqltypes.Result{}, nil) + for _, queryResult := range tt.expectQueries { + if queryResult.result == nil { + queryResult.result = &sqltypes.Result{} + } + vdiffenv.dbClient.ExpectRequest(queryResult.query, queryResult.result, nil) } got, err := tt.vde.PerformVDiffAction(ctx, tt.req) + vdiffenv.dbClient.Wait() if tt.wantErr != nil && !vterrors.Equals(err, tt.wantErr) { t.Errorf("Engine.PerformVDiffAction() error = %v, wantErr %v", err, tt.wantErr) return @@ -163,6 +204,9 @@ func TestPerformVDiffAction(t 
*testing.T) { err := tt.postFunc() require.NoError(t, err, "post function failed: %v", err) } + // No VDiffs should be running anymore. + require.Equal(t, 0, len(vdiffenv.vde.controllers), "expected no controllers to be running, but found %d", + len(vdiffenv.vde.controllers)) }) } } diff --git a/go/vt/vttablet/tabletmanager/vdiff/controller.go b/go/vt/vttablet/tabletmanager/vdiff/controller.go index 86d5c8b1822..265297d5114 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/controller.go +++ b/go/vt/vttablet/tabletmanager/vdiff/controller.go @@ -23,19 +23,29 @@ import ( "strings" "time" +<<<<<<< HEAD "vitess.io/vitess/go/vt/proto/tabletmanagerdata" "vitess.io/vitess/go/vt/vterrors" "google.golang.org/protobuf/encoding/prototext" "vitess.io/vitess/go/mysql" +======= + "google.golang.org/protobuf/encoding/prototext" + + "vitess.io/vitess/go/mysql/replication" +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/binlog/binlogplayer" "vitess.io/vitess/go/vt/log" - binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" - vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/proto/tabletmanagerdata" + "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tmclient" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) /* @@ -177,6 +187,8 @@ func (ct *controller) start(ctx context.Context, dbClient binlogplayer.DBClient) select { case <-ctx.Done(): return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } ct.workflowFilter = fmt.Sprintf("where workflow = %s and db_name = %s", encodeString(ct.workflow), encodeString(ct.vde.dbName)) @@ -190,6 +202,8 @@ func (ct *controller) start(ctx context.Context, dbClient 
binlogplayer.DBClient) select { case <-ctx.Done(): return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } source := newMigrationSource() @@ -306,9 +320,9 @@ func (ct *controller) saveErrorState(ctx context.Context, saveErr error) error { log.Warningf("Failed to persist vdiff error state: %v. Will retry in %s", err, retryDelay.String()) select { case <-ctx.Done(): - return fmt.Errorf("engine is shutting down") + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "engine is shutting down") case <-ct.done: - return fmt.Errorf("vdiff was stopped") + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") case <-time.After(retryDelay): if retryDelay < maxRetryDelay { retryDelay = time.Duration(float64(retryDelay) * 1.5) diff --git a/go/vt/vttablet/tabletmanager/vdiff/schema.go b/go/vt/vttablet/tabletmanager/vdiff/schema.go index fa226aa2c74..fb72a5e5161 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/schema.go +++ b/go/vt/vttablet/tabletmanager/vdiff/schema.go @@ -40,6 +40,7 @@ const ( // sqlUpdateVDiffState has a penultimate placeholder for any additional columns you want to update, e.g. 
`, foo = 1` sqlUpdateVDiffState = "update _vt.vdiff set state = %s, last_error = %s %s where id = %d" sqlUpdateVDiffStopped = `update _vt.vdiff as vd, _vt.vdiff_table as vdt set vd.state = 'stopped', vdt.state = 'stopped', vd.last_error = '' +<<<<<<< HEAD where vd.id = vdt.vdiff_id and vd.id = %d and vd.state != 'completed'` sqlGetVReplicationEntry = "select * from _vt.vreplication %s" sqlGetVDiffsToRun = "select * from _vt.vdiff where state in ('started','pending')" // what VDiffs have not been stopped or completed @@ -47,6 +48,17 @@ const ( sqlGetVDiffID = "select id as id from _vt.vdiff where vdiff_uuid = %s" sqlGetAllVDiffs = "select * from _vt.vdiff order by id desc" sqlGetAllTableRows = "select table_name as table_name, table_rows as table_rows from INFORMATION_SCHEMA.TABLES where table_schema = %s and table_name in (%s)" +======= + where vd.id = vdt.vdiff_id and vd.id = %a and vd.state != 'completed'` + sqlGetVReplicationEntry = "select * from _vt.vreplication %s" + sqlGetVDiffsToRun = "select * from _vt.vdiff where state in ('started','pending')" // what VDiffs have not been stopped or completed + sqlGetVDiffsToRetry = "select * from _vt.vdiff where state = 'error' and json_unquote(json_extract(options, '$.core_options.auto_retry')) = 'true'" + sqlGetVDiffID = "select id as id from _vt.vdiff where vdiff_uuid = %a" + sqlGetVDiffIDsByKeyspaceWorkflow = "select id as id from _vt.vdiff where keyspace = %a and workflow = %a" + sqlGetAllVDiffs = "select * from _vt.vdiff order by id desc" + sqlGetTableRows = "select table_rows as table_rows from INFORMATION_SCHEMA.TABLES where table_schema = %a and table_name = %a" + sqlGetAllTableRows = "select table_name as table_name, table_rows as table_rows from INFORMATION_SCHEMA.TABLES where table_schema = %s and table_name in (%s)" +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) sqlNewVDiffTable = "insert into _vt.vdiff_table(vdiff_id, table_name, state, table_rows) values(%d, 
%s, 'pending', %d)" sqlGetVDiffTable = `select vdt.lastpk as lastpk, vdt.mismatch as mismatch, vdt.report as report diff --git a/go/vt/vttablet/tabletmanager/vdiff/table_differ.go b/go/vt/vttablet/tabletmanager/vdiff/table_differ.go index 799f95d080e..58757bdd672 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/table_differ.go +++ b/go/vt/vttablet/tabletmanager/vdiff/table_differ.go @@ -409,6 +409,8 @@ func (td *tableDiffer) streamOneShard(ctx context.Context, participant *shardStr case participant.result <- result: case <-ctx.Done(): return vterrors.Wrap(ctx.Err(), "VStreamRows") + case <-td.wd.ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") } return nil }) @@ -490,6 +492,8 @@ func (td *tableDiffer) diff(ctx context.Context, rowsToCompare int64, debug, onl select { case <-ctx.Done(): return nil, vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-td.wd.ct.done: + return nil, vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } diff --git a/go/vt/vttablet/tabletmanager/vdiff/workflow_differ.go b/go/vt/vttablet/tabletmanager/vdiff/workflow_differ.go index 35236b50d79..0671d676c7f 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/workflow_differ.go +++ b/go/vt/vttablet/tabletmanager/vdiff/workflow_differ.go @@ -97,6 +97,8 @@ func (wd *workflowDiffer) diffTable(ctx context.Context, dbClient binlogplayer.D select { case <-ctx.Done(): return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-wd.ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } @@ -141,6 +143,8 @@ func (wd *workflowDiffer) diff(ctx context.Context) error { select { case <-ctx.Done(): return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-wd.ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } @@ -160,6 +164,8 @@ func (wd *workflowDiffer) diff(ctx context.Context) error { select { case <-ctx.Done(): return 
vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") + case <-wd.ct.done: + return vterrors.Errorf(vtrpcpb.Code_CANCELED, "vdiff was stopped") default: } query := fmt.Sprintf(sqlGetVDiffTable, wd.ct.id, encodeString(td.table.Name)) diff --git a/go/vt/wrangler/workflow.go b/go/vt/wrangler/workflow.go index fbdeb58dd17..f2ab2bb85b9 100644 --- a/go/vt/wrangler/workflow.go +++ b/go/vt/wrangler/workflow.go @@ -15,8 +15,14 @@ import ( "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topotools" "vitess.io/vitess/go/vt/vtctl/workflow" +<<<<<<< HEAD "vitess.io/vitess/go/vt/vtgate/evalengine" +======= + vdiff2 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vdiff" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) @@ -703,20 +709,22 @@ func (vrw *VReplicationWorkflow) GetCopyProgress() (*CopyProgress, error) { // region Workflow related utility functions -// deleteWorkflowVDiffData cleans up any potential VDiff related data associated with the workflow on the given tablet +// deleteWorkflowVDiffData cleans up any potential VDiff related data associated +// with the workflow on the given tablet. 
func (wr *Wrangler) deleteWorkflowVDiffData(ctx context.Context, tablet *topodatapb.Tablet, workflow string) { - sqlDeleteVDiffs := `delete from vd, vdt, vdl using _vt.vdiff as vd inner join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) - inner join _vt.vdiff_log as vdl on (vd.id = vdl.vdiff_id) - where vd.keyspace = %s and vd.workflow = %s` - query := fmt.Sprintf(sqlDeleteVDiffs, encodeString(tablet.Keyspace), encodeString(workflow)) - rows := -1 - if _, err := wr.tmc.ExecuteFetchAsDba(ctx, tablet, false, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ - Query: []byte(query), - MaxRows: uint64(rows), + if _, err := wr.tmc.VDiff(ctx, tablet, &tabletmanagerdatapb.VDiffRequest{ + Keyspace: tablet.Keyspace, + Workflow: workflow, + Action: string(vdiff2.DeleteAction), + ActionArg: vdiff2.AllActionArg, }); err != nil { +<<<<<<< HEAD if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num != mysql.ERNoSuchTable { // the tables may not exist if no vdiffs have been run wr.Logger().Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) } +======= + log.Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) +>>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) } } From d24a5e535c71bd629ad067d69d1f5696290c4d77 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 28 Sep 2023 22:28:07 -0400 Subject: [PATCH 2/2] Adjust for v16 Signed-off-by: Matt Lord --- go/vt/vtctl/workflow/server.go | 2133 ----------------- .../vttablet/tabletmanager/framework_test.go | 492 ---- go/vt/vttablet/tabletmanager/vdiff/action.go | 16 +- .../tabletmanager/vdiff/controller.go | 10 - go/vt/vttablet/tabletmanager/vdiff/schema.go | 14 +- go/vt/wrangler/workflow.go | 11 - 6 files changed, 9 insertions(+), 2667 deletions(-) delete mode 100644 go/vt/vttablet/tabletmanager/framework_test.go diff --git a/go/vt/vtctl/workflow/server.go b/go/vt/vtctl/workflow/server.go index 53768ade993..b26e198c2cd 
100644 --- a/go/vt/vtctl/workflow/server.go +++ b/go/vt/vtctl/workflow/server.go @@ -749,2136 +749,3 @@ func (s *Server) getWorkflowCopyStates(ctx context.Context, tablet *topo.TabletI return copyStates, nil } -<<<<<<< HEAD -======= - -// MoveTablesCreate is part of the vtctlservicepb.VtctldServer interface. -// It passes the embedded TabletRequest object to the given keyspace's -// target primary tablets that will be executing the workflow. -func (s *Server) MoveTablesCreate(ctx context.Context, req *vtctldatapb.MoveTablesCreateRequest) (res *vtctldatapb.WorkflowStatusResponse, err error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.MoveTablesCreate") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("cells", req.Cells) - span.Annotate("tablet_types", req.TabletTypes) - span.Annotate("on_ddl", req.OnDdl) - - sourceKeyspace := req.SourceKeyspace - targetKeyspace := req.TargetKeyspace - //FIXME validate tableSpecs, allTables, excludeTables - var ( - tables = req.IncludeTables - externalTopo *topo.Server - sourceTopo = s.ts - ) - - // When the source is an external cluster mounted using the Mount command. 
- if req.ExternalClusterName != "" { - externalTopo, err = s.ts.OpenExternalVitessClusterServer(ctx, req.ExternalClusterName) - if err != nil { - return nil, err - } - sourceTopo = externalTopo - log.Infof("Successfully opened external topo: %+v", externalTopo) - } - - var vschema *vschemapb.Keyspace - var origVSchema *vschemapb.Keyspace // If we need to rollback a failed create - vschema, err = s.ts.GetVSchema(ctx, targetKeyspace) - if err != nil { - return nil, err - } - if vschema == nil { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no vschema found for target keyspace %s", targetKeyspace) - } - ksTables, err := getTablesInKeyspace(ctx, sourceTopo, s.tmc, sourceKeyspace) - if err != nil { - return nil, err - } - if len(tables) > 0 { - err = s.validateSourceTablesExist(ctx, sourceKeyspace, ksTables, tables) - if err != nil { - return nil, err - } - } else { - if req.AllTables { - tables = ksTables - } else { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no tables to move") - } - } - if len(req.ExcludeTables) > 0 { - err = s.validateSourceTablesExist(ctx, sourceKeyspace, ksTables, req.ExcludeTables) - if err != nil { - return nil, err - } - } - var tables2 []string - for _, t := range tables { - if shouldInclude(t, req.ExcludeTables) { - tables2 = append(tables2, t) - } - } - tables = tables2 - if len(tables) == 0 { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no tables to move") - } - log.Infof("Found tables to move: %s", strings.Join(tables, ",")) - - if !vschema.Sharded { - // Save the original in case we need to restore it for a late failure - // in the defer(). 
- origVSchema = vschema.CloneVT() - if err := s.addTablesToVSchema(ctx, sourceKeyspace, vschema, tables, externalTopo == nil); err != nil { - return nil, err - } - } - - ms := &vtctldatapb.MaterializeSettings{ - Workflow: req.Workflow, - MaterializationIntent: vtctldatapb.MaterializationIntent_MOVETABLES, - SourceKeyspace: sourceKeyspace, - TargetKeyspace: targetKeyspace, - Cell: strings.Join(req.Cells, ","), - TabletTypes: topoproto.MakeStringTypeCSV(req.TabletTypes), - TabletSelectionPreference: req.TabletSelectionPreference, - StopAfterCopy: req.StopAfterCopy, - ExternalCluster: req.ExternalClusterName, - SourceShards: req.SourceShards, - OnDdl: req.OnDdl, - DeferSecondaryKeys: req.DeferSecondaryKeys, - AtomicCopy: req.AtomicCopy, - } - if req.SourceTimeZone != "" { - ms.SourceTimeZone = req.SourceTimeZone - ms.TargetTimeZone = "UTC" - } - createDDLMode := createDDLAsCopy - if req.DropForeignKeys { - createDDLMode = createDDLAsCopyDropForeignKeys - } - - for _, table := range tables { - buf := sqlparser.NewTrackedBuffer(nil) - buf.Myprintf("select * from %v", sqlparser.NewIdentifierCS(table)) - ms.TableSettings = append(ms.TableSettings, &vtctldatapb.TableMaterializeSettings{ - TargetTable: table, - SourceExpression: buf.String(), - CreateDdl: createDDLMode, - }) - } - mz := &materializer{ - ctx: ctx, - ts: s.ts, - sourceTs: sourceTopo, - tmc: s.tmc, - ms: ms, - } - err = mz.prepareMaterializerStreams(req) - if err != nil { - return nil, err - } - - // If we get an error after this point, where the vreplication streams/records - // have been created, then we clean up the workflow's artifacts. 
- defer func() { - if err != nil { - ts, cerr := s.buildTrafficSwitcher(ctx, ms.TargetKeyspace, ms.Workflow) - if cerr != nil { - err = vterrors.Wrapf(err, "failed to cleanup workflow artifacts: %v", cerr) - } - if cerr := s.dropArtifacts(ctx, false, &switcher{s: s, ts: ts}); cerr != nil { - err = vterrors.Wrapf(err, "failed to cleanup workflow artifacts: %v", cerr) - } - if origVSchema == nil { // There's no previous version to restore - return - } - if cerr := s.ts.SaveVSchema(ctx, targetKeyspace, origVSchema); cerr != nil { - err = vterrors.Wrapf(err, "failed to restore original target vschema: %v", cerr) - } - } - }() - - // Now that the streams have been successfully created, let's put the associated - // routing rules in place. - if externalTopo == nil { - if req.NoRoutingRules { - log.Warningf("Found --no-routing-rules flag, not creating routing rules for workflow %s.%s", targetKeyspace, req.Workflow) - } else { - // Save routing rules before vschema. If we save vschema first, and routing - // rules fails to save, we may generate duplicate table errors. - if mz.isPartial { - if err := createDefaultShardRoutingRules(mz.ctx, mz.ms, mz.ts); err != nil { - return nil, err - } - } - - rules, err := topotools.GetRoutingRules(ctx, s.ts) - if err != nil { - return nil, err - } - for _, table := range tables { - toSource := []string{sourceKeyspace + "." + table} - rules[table] = toSource - rules[table+"@replica"] = toSource - rules[table+"@rdonly"] = toSource - rules[targetKeyspace+"."+table] = toSource - rules[targetKeyspace+"."+table+"@replica"] = toSource - rules[targetKeyspace+"."+table+"@rdonly"] = toSource - rules[targetKeyspace+"."+table] = toSource - rules[sourceKeyspace+"."+table+"@replica"] = toSource - rules[sourceKeyspace+"."+table+"@rdonly"] = toSource - } - if err := topotools.SaveRoutingRules(ctx, s.ts, rules); err != nil { - return nil, err - } - } - if vschema != nil { - // We added to the vschema. 
- if err := s.ts.SaveVSchema(ctx, targetKeyspace, vschema); err != nil { - return nil, err - } - } - - } - if err := s.ts.RebuildSrvVSchema(ctx, nil); err != nil { - return nil, err - } - - if ms.SourceTimeZone != "" { - if err := mz.checkTZConversion(ctx, ms.SourceTimeZone); err != nil { - return nil, err - } - } - - tabletShards, err := s.collectTargetStreams(ctx, mz) - if err != nil { - return nil, err - } - - migrationID, err := getMigrationID(targetKeyspace, tabletShards) - if err != nil { - return nil, err - } - - if mz.ms.ExternalCluster == "" { - exists, tablets, err := s.checkIfPreviousJournalExists(ctx, mz, migrationID) - if err != nil { - return nil, err - } - if exists { - log.Errorf("Found a previous journal entry for %d", migrationID) - msg := fmt.Sprintf("found an entry from a previous run for migration id %d in _vt.resharding_journal on tablets %s, ", - migrationID, strings.Join(tablets, ",")) - msg += fmt.Sprintf("please review and delete it before proceeding and then start the workflow using: MoveTables --workflow %s --target-keyspace %s start", - req.Workflow, req.TargetKeyspace) - return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, msg) - } - } - - if req.AutoStart { - if err := mz.startStreams(ctx); err != nil { - return nil, err - } - } - - return s.WorkflowStatus(ctx, &vtctldatapb.WorkflowStatusRequest{ - Keyspace: targetKeyspace, - Workflow: req.Workflow, - }) -} - -// MoveTablesComplete is part of the vtctlservicepb.VtctldServer interface. -// It cleans up a successful MoveTables workflow and its related artifacts. 
-func (s *Server) MoveTablesComplete(ctx context.Context, req *vtctldatapb.MoveTablesCompleteRequest) (*vtctldatapb.MoveTablesCompleteResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.MoveTablesComplete") - defer span.Finish() - - ts, state, err := s.getWorkflowState(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - - var summary string - if req.DryRun { - summary = fmt.Sprintf("Complete dry run results for workflow %s.%s at %v", req.TargetKeyspace, req.Workflow, time.Now().UTC().Format(time.RFC822)) - } else { - summary = fmt.Sprintf("Successfully completed the %s workflow in the %s keyspace", req.Workflow, req.TargetKeyspace) - } - var dryRunResults *[]string - - if state.WorkflowType == TypeMigrate { - dryRunResults, err = s.finalizeMigrateWorkflow(ctx, req.TargetKeyspace, req.Workflow, strings.Join(ts.tables, ","), - false, req.KeepData, req.KeepRoutingRules, req.DryRun) - if err != nil { - return nil, vterrors.Wrapf(err, "failed to finalize the %s workflow in the %s keyspace", - req.Workflow, req.TargetKeyspace) - } - resp := &vtctldatapb.MoveTablesCompleteResponse{ - Summary: summary, - } - if dryRunResults != nil { - resp.DryRunResults = *dryRunResults - } - return resp, nil - } - - if !state.WritesSwitched || len(state.ReplicaCellsNotSwitched) > 0 || len(state.RdonlyCellsNotSwitched) > 0 { - return nil, ErrWorkflowNotFullySwitched - } - var renameTable TableRemovalType - if req.RenameTables { - renameTable = RenameTable - } else { - renameTable = DropTable - } - if dryRunResults, err = s.dropSources(ctx, ts, renameTable, req.KeepData, req.KeepRoutingRules, false, req.DryRun); err != nil { - return nil, err - } - - resp := &vtctldatapb.MoveTablesCompleteResponse{ - Summary: summary, - } - if dryRunResults != nil { - resp.DryRunResults = *dryRunResults - } - - return resp, nil -} - -// ReshardCreate is part of the vtctlservicepb.VtctldServer interface. 
-func (s *Server) ReshardCreate(ctx context.Context, req *vtctldatapb.ReshardCreateRequest) (*vtctldatapb.WorkflowStatusResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.ReshardCreate") - defer span.Finish() - - span.Annotate("keyspace", req.Keyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("source_shards", req.SourceShards) - span.Annotate("target_shards", req.TargetShards) - span.Annotate("cells", req.Cells) - span.Annotate("tablet_types", req.TabletTypes) - span.Annotate("on_ddl", req.OnDdl) - - keyspace := req.Keyspace - cells := req.Cells - // TODO: validate workflow does not exist. - - if err := s.ts.ValidateSrvKeyspace(ctx, keyspace, strings.Join(cells, ",")); err != nil { - err2 := vterrors.Wrapf(err, "SrvKeyspace for keyspace %s is corrupt for cell(s) %s", keyspace, cells) - log.Errorf("%w", err2) - return nil, err - } - rs, err := s.buildResharder(ctx, keyspace, req.Workflow, req.SourceShards, req.TargetShards, strings.Join(cells, ","), "") - if err != nil { - return nil, vterrors.Wrap(err, "buildResharder") - } - rs.onDDL = req.OnDdl - rs.stopAfterCopy = req.StopAfterCopy - rs.deferSecondaryKeys = req.DeferSecondaryKeys - if !req.SkipSchemaCopy { - if err := rs.copySchema(ctx); err != nil { - return nil, vterrors.Wrap(err, "copySchema") - } - } - if err := rs.createStreams(ctx); err != nil { - return nil, vterrors.Wrap(err, "createStreams") - } - - if req.AutoStart { - if err := rs.startStreams(ctx); err != nil { - return nil, vterrors.Wrap(err, "startStreams") - } - } else { - log.Warningf("Streams will not be started since --auto-start is set to false") - } - return nil, nil -} - -// VDiffCreate is part of the vtctlservicepb.VtctldServer interface. -// It passes on the request to the target primary tablets that are -// participating in the given workflow and VDiff. 
-func (s *Server) VDiffCreate(ctx context.Context, req *vtctldatapb.VDiffCreateRequest) (*vtctldatapb.VDiffCreateResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffCreate") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("uuid", req.Uuid) - span.Annotate("source_cells", req.SourceCells) - span.Annotate("target_cells", req.TargetCells) - span.Annotate("tablet_types", req.TabletTypes) - span.Annotate("tables", req.Tables) - span.Annotate("auto_retry", req.AutoRetry) - - tabletTypesStr := topoproto.MakeStringTypeCSV(req.TabletTypes) - if req.TabletSelectionPreference == tabletmanagerdatapb.TabletSelectionPreference_INORDER { - tabletTypesStr = discovery.InOrderHint + tabletTypesStr - } - - options := &tabletmanagerdatapb.VDiffOptions{ - PickerOptions: &tabletmanagerdatapb.VDiffPickerOptions{ - TabletTypes: tabletTypesStr, - SourceCell: strings.Join(req.SourceCells, ","), - TargetCell: strings.Join(req.TargetCells, ","), - }, - CoreOptions: &tabletmanagerdatapb.VDiffCoreOptions{ - Tables: strings.Join(req.Tables, ","), - AutoRetry: req.AutoRetry, - MaxRows: req.MaxExtraRowsToCompare, - TimeoutSeconds: req.FilteredReplicationWaitTime.Seconds, - MaxExtraRowsToCompare: req.MaxExtraRowsToCompare, - UpdateTableStats: req.UpdateTableStats, - }, - ReportOptions: &tabletmanagerdatapb.VDiffReportOptions{ - OnlyPks: req.OnlyPKs, - DebugQuery: req.DebugQuery, - }, - } - - tabletreq := &tabletmanagerdatapb.VDiffRequest{ - Keyspace: req.TargetKeyspace, - Workflow: req.Workflow, - Action: string(vdiff.CreateAction), - Options: options, - VdiffUuid: req.Uuid, - } - - ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - if ts.frozen { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid VDiff run: writes have been already been switched for workflow %s.%s", - req.TargetKeyspace, req.Workflow) - } - - 
err = ts.ForAllTargets(func(target *MigrationTarget) error { - _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) - return err - }) - if err != nil { - log.Errorf("Error executing vdiff create action: %v", err) - return nil, err - } - - return &vtctldatapb.VDiffCreateResponse{ - UUID: req.Uuid, - }, nil -} - -// VDiffDelete is part of the vtctlservicepb.VtctldServer interface. -func (s *Server) VDiffDelete(ctx context.Context, req *vtctldatapb.VDiffDeleteRequest) (*vtctldatapb.VDiffDeleteResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffDelete") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("argument", req.Arg) - - tabletreq := &tabletmanagerdatapb.VDiffRequest{ - Keyspace: req.TargetKeyspace, - Workflow: req.Workflow, - Action: string(vdiff.DeleteAction), - ActionArg: req.Arg, - } - - ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - - err = ts.ForAllTargets(func(target *MigrationTarget) error { - _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) - return err - }) - if err != nil { - log.Errorf("Error executing vdiff delete action: %v", err) - return nil, err - } - - return &vtctldatapb.VDiffDeleteResponse{}, nil -} - -// VDiffResume is part of the vtctlservicepb.VtctldServer interface. 
-func (s *Server) VDiffResume(ctx context.Context, req *vtctldatapb.VDiffResumeRequest) (*vtctldatapb.VDiffResumeResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffResume") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("uuid", req.Uuid) - - tabletreq := &tabletmanagerdatapb.VDiffRequest{ - Keyspace: req.TargetKeyspace, - Workflow: req.Workflow, - Action: string(vdiff.ResumeAction), - VdiffUuid: req.Uuid, - } - - ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - - err = ts.ForAllTargets(func(target *MigrationTarget) error { - _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) - return err - }) - if err != nil { - log.Errorf("Error executing vdiff resume action: %v", err) - return nil, err - } - - return &vtctldatapb.VDiffResumeResponse{}, nil -} - -// VDiffShow is part of the vtctlservicepb.VtctldServer interface. 
-func (s *Server) VDiffShow(ctx context.Context, req *vtctldatapb.VDiffShowRequest) (*vtctldatapb.VDiffShowResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffShow") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("argument", req.Arg) - - tabletreq := &tabletmanagerdatapb.VDiffRequest{ - Keyspace: req.TargetKeyspace, - Workflow: req.Workflow, - Action: string(vdiff.ShowAction), - ActionArg: req.Arg, - } - - ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - - output := &vdiffOutput{ - responses: make(map[string]*tabletmanagerdatapb.VDiffResponse, len(ts.targets)), - err: nil, - } - output.err = ts.ForAllTargets(func(target *MigrationTarget) error { - resp, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) - output.mu.Lock() - defer output.mu.Unlock() - output.responses[target.GetShard().ShardName()] = resp - return err - }) - if output.err != nil { - log.Errorf("Error executing vdiff show action: %v", output.err) - return nil, output.err - } - - return &vtctldatapb.VDiffShowResponse{ - TabletResponses: output.responses, - }, nil -} - -// VDiffStop is part of the vtctlservicepb.VtctldServer interface. 
-func (s *Server) VDiffStop(ctx context.Context, req *vtctldatapb.VDiffStopRequest) (*vtctldatapb.VDiffStopResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.VDiffStop") - defer span.Finish() - - span.Annotate("keyspace", req.TargetKeyspace) - span.Annotate("workflow", req.Workflow) - span.Annotate("uuid", req.Uuid) - - tabletreq := &tabletmanagerdatapb.VDiffRequest{ - Keyspace: req.TargetKeyspace, - Workflow: req.Workflow, - Action: string(vdiff.StopAction), - VdiffUuid: req.Uuid, - } - - ts, err := s.buildTrafficSwitcher(ctx, req.TargetKeyspace, req.Workflow) - if err != nil { - return nil, err - } - - err = ts.ForAllTargets(func(target *MigrationTarget) error { - _, err := s.tmc.VDiff(ctx, target.GetPrimary().Tablet, tabletreq) - return err - }) - if err != nil { - log.Errorf("Error executing vdiff stop action: %v", err) - return nil, err - } - - return &vtctldatapb.VDiffStopResponse{}, nil -} - -// WorkflowDelete is part of the vtctlservicepb.VtctldServer interface. -// It passes on the request to the target primary tablets that are -// participating in the given workflow. -func (s *Server) WorkflowDelete(ctx context.Context, req *vtctldatapb.WorkflowDeleteRequest) (*vtctldatapb.WorkflowDeleteResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.WorkflowDelete") - defer span.Finish() - - span.Annotate("keyspace", req.Keyspace) - span.Annotate("workflow", req.Workflow) - - // Cleanup related data and artifacts. 
- if _, err := s.DropTargets(ctx, req.Keyspace, req.Workflow, req.KeepData, req.KeepRoutingRules, false); err != nil { - if topo.IsErrType(err, topo.NoNode) { - return nil, vterrors.Wrapf(err, "%s keyspace does not exist", req.Keyspace) - } - return nil, err - } - - deleteReq := &tabletmanagerdatapb.DeleteVReplicationWorkflowRequest{ - Workflow: req.Workflow, - } - vx := vexec.NewVExec(req.Keyspace, req.Workflow, s.ts, s.tmc) - callback := func(ctx context.Context, tablet *topo.TabletInfo) (*querypb.QueryResult, error) { - res, err := s.tmc.DeleteVReplicationWorkflow(ctx, tablet.Tablet, deleteReq) - if err != nil { - return nil, err - } - // Best effort cleanup and optimization of related data. - s.deleteWorkflowVDiffData(ctx, tablet.Tablet, req.Workflow) - s.optimizeCopyStateTable(tablet.Tablet) - return res.Result, err - } - res, err := vx.CallbackContext(ctx, callback) - if err != nil { - return nil, err - } - - if len(res) == 0 { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "the %s workflow does not exist in the %s keyspace", req.Workflow, req.Keyspace) - } - - response := &vtctldatapb.WorkflowDeleteResponse{} - response.Summary = fmt.Sprintf("Successfully cancelled the %s workflow in the %s keyspace", req.Workflow, req.Keyspace) - details := make([]*vtctldatapb.WorkflowDeleteResponse_TabletInfo, 0, len(res)) - for tinfo, tres := range res { - result := &vtctldatapb.WorkflowDeleteResponse_TabletInfo{ - Tablet: tinfo.Alias, - Deleted: tres.RowsAffected > 0, // Can be more than one with shard merges - } - details = append(details, result) - } - response.Details = details - return response, nil -} - -func (s *Server) WorkflowStatus(ctx context.Context, req *vtctldatapb.WorkflowStatusRequest) (*vtctldatapb.WorkflowStatusResponse, error) { - ts, state, err := s.getWorkflowState(ctx, req.Keyspace, req.Workflow) - if err != nil { - return nil, err - } - copyProgress, err := s.GetCopyProgress(ctx, ts, state) - if err != nil { - return nil, err - } - 
resp := &vtctldatapb.WorkflowStatusResponse{} - if copyProgress != nil { - resp.TableCopyState = make(map[string]*vtctldatapb.WorkflowStatusResponse_TableCopyState, len(*copyProgress)) - // We sort the tables for intuitive and consistent output. - var tables []string - for table := range *copyProgress { - tables = append(tables, table) - } - sort.Strings(tables) - var progress tableCopyProgress - for _, table := range tables { - var rowCountPct, tableSizePct float32 - resp.TableCopyState[table] = &vtctldatapb.WorkflowStatusResponse_TableCopyState{} - progress = *(*copyProgress)[table] - if progress.SourceRowCount > 0 { - rowCountPct = float32(100.0 * float64(progress.TargetRowCount) / float64(progress.SourceRowCount)) - } - if progress.SourceTableSize > 0 { - tableSizePct = float32(100.0 * float64(progress.TargetTableSize) / float64(progress.SourceTableSize)) - } - resp.TableCopyState[table].RowsCopied = progress.TargetRowCount - resp.TableCopyState[table].RowsTotal = progress.SourceRowCount - resp.TableCopyState[table].RowsPercentage = rowCountPct - resp.TableCopyState[table].BytesCopied = progress.TargetTableSize - resp.TableCopyState[table].BytesTotal = progress.SourceTableSize - resp.TableCopyState[table].BytesPercentage = tableSizePct - } - } - - workflow, err := s.GetWorkflow(ctx, req.Keyspace, req.Workflow) - if err != nil { - return nil, err - } - - // The stream key is target keyspace/tablet alias, e.g. 0/test-0000000100. - // We sort the keys for intuitive and consistent output. 
- streamKeys := make([]string, 0, len(workflow.ShardStreams)) - for streamKey := range workflow.ShardStreams { - streamKeys = append(streamKeys, streamKey) - } - sort.Strings(streamKeys) - resp.ShardStreams = make(map[string]*vtctldatapb.WorkflowStatusResponse_ShardStreams, len(streamKeys)) - for _, streamKey := range streamKeys { - streams := workflow.ShardStreams[streamKey].GetStreams() - keyParts := strings.Split(streamKey, "/") - if len(keyParts) != 2 { - return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unexpected stream key format in: %s ; expect /", - streamKey) - } - // We want to use target keyspace/shard as the map key for the - // response, e.g. customer/-80. - ksShard := fmt.Sprintf("%s/%s", req.Keyspace, keyParts[0]) - resp.ShardStreams[ksShard] = &vtctldatapb.WorkflowStatusResponse_ShardStreams{} - resp.ShardStreams[ksShard].Streams = make([]*vtctldatapb.WorkflowStatusResponse_ShardStreamState, len(streams)) - for i, st := range streams { - info := []string{} - ts := &vtctldatapb.WorkflowStatusResponse_ShardStreamState{} - if st.State == binlogdatapb.VReplicationWorkflowState_Error.String() { - info = append(info, st.Message) - } else if st.Position == "" { - info = append(info, "VStream has not started") - } else { - now := time.Now().Nanosecond() - updateLag := int64(now) - st.TimeUpdated.Seconds - if updateLag > 0*1e9 { - info = append(info, "VStream may not be running") - } - txLag := int64(now) - st.TransactionTimestamp.Seconds - info = append(info, fmt.Sprintf("VStream Lag: %ds", txLag/1e9)) - if st.TransactionTimestamp.Seconds > 0 { // if no events occur after copy phase, TransactionTimeStamp can be 0 - info = append(info, fmt.Sprintf("; Tx time: %s.", time.Unix(st.TransactionTimestamp.Seconds, 0).Format(time.ANSIC))) - } - } - ts.Id = int32(st.Id) - ts.Tablet = st.Tablet - ts.SourceShard = fmt.Sprintf("%s/%s", st.BinlogSource.Keyspace, st.BinlogSource.Shard) - ts.Position = st.Position - ts.Status = st.State - ts.Info = strings.Join(info, "; 
") - resp.ShardStreams[ksShard].Streams[i] = ts - } - } - - return resp, nil -} - -// GetCopyProgress returns the progress of all tables being copied in the -// workflow. -func (s *Server) GetCopyProgress(ctx context.Context, ts *trafficSwitcher, state *State) (*copyProgress, error) { - getTablesQuery := "select distinct table_name from _vt.copy_state cs, _vt.vreplication vr where vr.id = cs.vrepl_id and vr.id = %d" - getRowCountQuery := "select table_name, table_rows, data_length from information_schema.tables where table_schema = %s and table_name in (%s)" - tables := make(map[string]bool) - const MaxRows = 1000 - sourcePrimaries := make(map[*topodatapb.TabletAlias]bool) - for _, target := range ts.targets { - for id, bls := range target.Sources { - query := fmt.Sprintf(getTablesQuery, id) - p3qr, err := s.tmc.ExecuteFetchAsDba(ctx, target.GetPrimary().Tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ - Query: []byte(query), - MaxRows: MaxRows, - }) - if err != nil { - return nil, err - } - if len(p3qr.Rows) < 1 { - continue - } - qr := sqltypes.Proto3ToResult(p3qr) - for i := 0; i < len(p3qr.Rows); i++ { - tables[qr.Rows[i][0].ToString()] = true - } - sourcesi, err := s.ts.GetShard(ctx, bls.Keyspace, bls.Shard) - if err != nil { - return nil, err - } - found := false - for existingSource := range sourcePrimaries { - if existingSource.Uid == sourcesi.PrimaryAlias.Uid { - found = true - } - } - if !found { - sourcePrimaries[sourcesi.PrimaryAlias] = true - } - } - } - if len(tables) == 0 { - return nil, nil - } - var tableList []string - targetRowCounts := make(map[string]int64) - sourceRowCounts := make(map[string]int64) - targetTableSizes := make(map[string]int64) - sourceTableSizes := make(map[string]int64) - - for table := range tables { - tableList = append(tableList, encodeString(table)) - targetRowCounts[table] = 0 - sourceRowCounts[table] = 0 - targetTableSizes[table] = 0 - sourceTableSizes[table] = 0 - } - - var getTableMetrics = func(tablet 
*topodatapb.Tablet, query string, rowCounts *map[string]int64, tableSizes *map[string]int64) error { - p3qr, err := s.tmc.ExecuteFetchAsDba(ctx, tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ - Query: []byte(query), - MaxRows: uint64(len(tables)), - }) - if err != nil { - return err - } - qr := sqltypes.Proto3ToResult(p3qr) - for i := 0; i < len(qr.Rows); i++ { - table := qr.Rows[i][0].ToString() - rowCount, err := qr.Rows[i][1].ToCastInt64() - if err != nil { - return err - } - tableSize, err := qr.Rows[i][2].ToCastInt64() - if err != nil { - return err - } - (*rowCounts)[table] += rowCount - (*tableSizes)[table] += tableSize - } - return nil - } - sourceDbName := "" - for _, tsSource := range ts.sources { - sourceDbName = tsSource.GetPrimary().DbName() - break - } - if sourceDbName == "" { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no sources found for workflow %s.%s", state.TargetKeyspace, state.Workflow) - } - targetDbName := "" - for _, tsTarget := range ts.targets { - targetDbName = tsTarget.GetPrimary().DbName() - break - } - if sourceDbName == "" || targetDbName == "" { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "workflow %s.%s is incorrectly configured", state.TargetKeyspace, state.Workflow) - } - sort.Strings(tableList) // sort list for repeatability for mocking in tests - tablesStr := strings.Join(tableList, ",") - query := fmt.Sprintf(getRowCountQuery, encodeString(targetDbName), tablesStr) - for _, target := range ts.targets { - tablet := target.GetPrimary().Tablet - if err := getTableMetrics(tablet, query, &targetRowCounts, &targetTableSizes); err != nil { - return nil, err - } - } - - query = fmt.Sprintf(getRowCountQuery, encodeString(sourceDbName), tablesStr) - for source := range sourcePrimaries { - ti, err := s.ts.GetTablet(ctx, source) - tablet := ti.Tablet - if err != nil { - return nil, err - } - if err := getTableMetrics(tablet, query, &sourceRowCounts, &sourceTableSizes); err != nil { - 
return nil, err - } - } - - copyProgress := copyProgress{} - for table, rowCount := range targetRowCounts { - copyProgress[table] = &tableCopyProgress{ - TargetRowCount: rowCount, - TargetTableSize: targetTableSizes[table], - SourceRowCount: sourceRowCounts[table], - SourceTableSize: sourceTableSizes[table], - } - } - return ©Progress, nil -} - -// WorkflowUpdate is part of the vtctlservicepb.VtctldServer interface. -// It passes the embedded TabletRequest object to the given keyspace's -// target primary tablets that are participating in the given workflow. -func (s *Server) WorkflowUpdate(ctx context.Context, req *vtctldatapb.WorkflowUpdateRequest) (*vtctldatapb.WorkflowUpdateResponse, error) { - span, ctx := trace.NewSpan(ctx, "workflow.Server.WorkflowUpdate") - defer span.Finish() - - span.Annotate("keyspace", req.Keyspace) - span.Annotate("workflow", req.TabletRequest.Workflow) - span.Annotate("cells", req.TabletRequest.Cells) - span.Annotate("tablet_types", req.TabletRequest.TabletTypes) - span.Annotate("on_ddl", req.TabletRequest.OnDdl) - - vx := vexec.NewVExec(req.Keyspace, req.TabletRequest.Workflow, s.ts, s.tmc) - callback := func(ctx context.Context, tablet *topo.TabletInfo) (*querypb.QueryResult, error) { - res, err := s.tmc.UpdateVReplicationWorkflow(ctx, tablet.Tablet, req.TabletRequest) - if err != nil { - return nil, err - } - return res.Result, err - } - res, err := vx.CallbackContext(ctx, callback) - if err != nil { - if topo.IsErrType(err, topo.NoNode) { - return nil, vterrors.Wrapf(err, "%s keyspace does not exist", req.Keyspace) - } - return nil, err - } - - if len(res) == 0 { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "the %s workflow does not exist in the %s keyspace", req.TabletRequest.Workflow, req.Keyspace) - } - - response := &vtctldatapb.WorkflowUpdateResponse{} - response.Summary = fmt.Sprintf("Successfully updated the %s workflow on (%d) target primary tablets in the %s keyspace", req.TabletRequest.Workflow, 
len(res), req.Keyspace) - details := make([]*vtctldatapb.WorkflowUpdateResponse_TabletInfo, 0, len(res)) - for tinfo, tres := range res { - result := &vtctldatapb.WorkflowUpdateResponse_TabletInfo{ - Tablet: tinfo.Alias, - Changed: tres.RowsAffected > 0, // Can be more than one with shard merges - } - details = append(details, result) - } - response.Details = details - return response, nil -} - -// validateSourceTablesExist validates that tables provided are present -// in the source keyspace. -func (s *Server) validateSourceTablesExist(ctx context.Context, sourceKeyspace string, ksTables, tables []string) error { - var missingTables []string - for _, table := range tables { - if schema.IsInternalOperationTableName(table) { - continue - } - found := false - - for _, ksTable := range ksTables { - if table == ksTable { - found = true - break - } - } - if !found { - missingTables = append(missingTables, table) - } - } - if len(missingTables) > 0 { - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "table(s) not found in source keyspace %s: %s", sourceKeyspace, strings.Join(missingTables, ",")) - } - return nil -} - -// addTablesToVSchema adds tables to an (unsharded) vschema if they are not already defined. -// If copyVSchema is true then we copy over the vschema table definitions from the source, -// otherwise we create empty ones. -// For a migrate workflow we do not copy the vschema since the source keyspace is just a -// proxy to import data into Vitess. 
-func (s *Server) addTablesToVSchema(ctx context.Context, sourceKeyspace string, targetVSchema *vschemapb.Keyspace, tables []string, copyVSchema bool) error { - if targetVSchema.Tables == nil { - targetVSchema.Tables = make(map[string]*vschemapb.Table) - } - if copyVSchema { - srcVSchema, err := s.ts.GetVSchema(ctx, sourceKeyspace) - if err != nil { - return vterrors.Wrapf(err, "failed to get vschema for source keyspace %s", sourceKeyspace) - } - for _, table := range tables { - srcTable, sok := srcVSchema.Tables[table] - if _, tok := targetVSchema.Tables[table]; sok && !tok { - targetVSchema.Tables[table] = srcTable - // If going from sharded to unsharded, then we need to remove the - // column vindexes as they are not valid for unsharded tables. - if srcVSchema.Sharded { - targetVSchema.Tables[table].ColumnVindexes = nil - } - } - } - } - // Ensure that each table at least has an empty definition on the target. - for _, table := range tables { - if _, tok := targetVSchema.Tables[table]; !tok { - targetVSchema.Tables[table] = &vschemapb.Table{} - } - } - return nil -} - -func (s *Server) collectTargetStreams(ctx context.Context, mz *materializer) ([]string, error) { - var shardTablets []string - var mu sync.Mutex - err := mz.forAllTargets(func(target *topo.ShardInfo) error { - var qrproto *querypb.QueryResult - var id int64 - var err error - targetPrimary, err := s.ts.GetTablet(ctx, target.PrimaryAlias) - if err != nil { - return vterrors.Wrapf(err, "GetTablet(%v) failed", target.PrimaryAlias) - } - query := fmt.Sprintf("select id from _vt.vreplication where db_name=%s and workflow=%s", encodeString(targetPrimary.DbName()), encodeString(mz.ms.Workflow)) - if qrproto, err = s.tmc.VReplicationExec(ctx, targetPrimary.Tablet, query); err != nil { - return vterrors.Wrapf(err, "VReplicationExec(%v, %s)", targetPrimary.Tablet, query) - } - qr := sqltypes.Proto3ToResult(qrproto) - for i := 0; i < len(qr.Rows); i++ { - id, err = qr.Rows[i][0].ToCastInt64() - if err != nil 
{ - return err - } - mu.Lock() - shardTablets = append(shardTablets, fmt.Sprintf("%s:%d", target.ShardName(), id)) - mu.Unlock() - } - return nil - }) - if err != nil { - return nil, err - } - return shardTablets, nil -} - -func (s *Server) checkIfPreviousJournalExists(ctx context.Context, mz *materializer, migrationID int64) (bool, []string, error) { - forAllSources := func(f func(*topo.ShardInfo) error) error { - var wg sync.WaitGroup - allErrors := &concurrency.AllErrorRecorder{} - for _, sourceShard := range mz.sourceShards { - wg.Add(1) - go func(sourceShard *topo.ShardInfo) { - defer wg.Done() - - if err := f(sourceShard); err != nil { - allErrors.RecordError(err) - } - }(sourceShard) - } - wg.Wait() - return allErrors.AggrError(vterrors.Aggregate) - } - - var ( - mu sync.Mutex - exists bool - tablets []string - ) - - err := forAllSources(func(si *topo.ShardInfo) error { - tablet, err := s.ts.GetTablet(ctx, si.PrimaryAlias) - if err != nil { - return err - } - if tablet == nil { - return nil - } - _, exists, err = s.CheckReshardingJournalExistsOnTablet(ctx, tablet.Tablet, migrationID) - if err != nil { - return err - } - if exists { - mu.Lock() - defer mu.Unlock() - tablets = append(tablets, tablet.AliasString()) - } - return nil - }) - return exists, tablets, err -} - -// deleteWorkflowVDiffData cleans up any potential VDiff related data associated -// with the workflow on the given tablet. 
-func (s *Server) deleteWorkflowVDiffData(ctx context.Context, tablet *topodatapb.Tablet, workflow string) { - if _, err := s.tmc.VDiff(ctx, tablet, &tabletmanagerdatapb.VDiffRequest{ - Keyspace: tablet.Keyspace, - Workflow: workflow, - Action: string(vdiff.DeleteAction), - ActionArg: vdiff.AllActionArg, - }); err != nil { - log.Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) - } -} - -// optimizeCopyStateTable rebuilds the copy_state table to ensure the on-disk -// structures are minimal and optimized and resets the auto-inc value for -// subsequent inserts. -// This helps to ensure that the size, storage, and performance related factors -// for the table remain optimal over time and that we don't ever exhaust the -// available auto-inc values for the table. -// Note: it's not critical that this executes successfully any given time, it's -// only important that we try to do this periodically so that things stay in an -// optimal state over long periods of time. For this reason, the work is done -// asynchronously in the background on the given tablet and any failures are -// logged as warnings. Because it's done in the background we use the AllPrivs -// account to be sure that we don't execute the writes if READ_ONLY is set on -// the MySQL instance. 
-func (s *Server) optimizeCopyStateTable(tablet *topodatapb.Tablet) { - if s.sem != nil { - if !s.sem.TryAcquire(1) { - log.Warningf("Deferring work to optimize the copy_state table on %q due to hitting the maximum concurrent background job limit.", - tablet.Alias.String()) - return - } - } - go func() { - defer func() { - if s.sem != nil { - s.sem.Release(1) - } - }() - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) - defer cancel() - sqlOptimizeTable := "optimize table _vt.copy_state" - if _, err := s.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ - Query: []byte(sqlOptimizeTable), - MaxRows: uint64(100), // always produces 1+rows with notes and status - }); err != nil { - if sqlErr, ok := err.(*sqlerror.SQLError); ok && sqlErr.Num == sqlerror.ERNoSuchTable { // the table may not exist - return - } - log.Warningf("Failed to optimize the copy_state table on %q: %v", tablet.Alias.String(), err) - } - // This will automatically set the value to 1 or the current max value in the - // table, whichever is greater. - sqlResetAutoInc := "alter table _vt.copy_state auto_increment = 1" - if _, err := s.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ - Query: []byte(sqlResetAutoInc), - MaxRows: uint64(0), - }); err != nil { - log.Warningf("Failed to reset the auto_increment value for the copy_state table on %q: %v", - tablet.Alias.String(), err) - } - }() -} - -// DropTargets cleans up target tables, shards and denied tables if a MoveTables/Reshard -// is cancelled. 
-func (s *Server) DropTargets(ctx context.Context, targetKeyspace, workflow string, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { - ts, state, err := s.getWorkflowState(ctx, targetKeyspace, workflow) - if err != nil { - log.Errorf("Failed to get VReplication workflow state for %s.%s: %v", targetKeyspace, workflow, err) - return nil, err - } - - // Return an error if the workflow traffic is partially switched. - if state.WritesSwitched || len(state.ReplicaCellsSwitched) > 0 || len(state.RdonlyCellsSwitched) > 0 { - return nil, ErrWorkflowPartiallySwitched - } - - if state.WorkflowType == TypeMigrate { - _, err := s.finalizeMigrateWorkflow(ctx, targetKeyspace, workflow, "", true, keepData, keepRoutingRules, dryRun) - return nil, err - } - - ts.keepRoutingRules = keepRoutingRules - var sw iswitcher - if dryRun { - sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} - } else { - sw = &switcher{s: s, ts: ts} - } - var tctx context.Context - tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropTargets") - if lockErr != nil { - ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr) - return nil, lockErr - } - defer sourceUnlock(&err) - ctx = tctx - - if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { - tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropTargets") - if lockErr != nil { - ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) - return nil, lockErr - } - defer targetUnlock(&err) - ctx = tctx - } - if !keepData { - switch ts.MigrationType() { - case binlogdatapb.MigrationType_TABLES: - if err := sw.removeTargetTables(ctx); err != nil { - return nil, err - } - if err := sw.dropSourceDeniedTables(ctx); err != nil { - return nil, err - } - if err := sw.dropTargetDeniedTables(ctx); err != nil { - return nil, err - } - case binlogdatapb.MigrationType_SHARDS: - if err := sw.dropTargetShards(ctx); err != nil { - return nil, err - } - } - } - if err := 
s.dropRelatedArtifacts(ctx, keepRoutingRules, sw); err != nil { - return nil, err - } - if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { - return nil, err - } - return sw.logs(), nil -} - -func (s *Server) buildTrafficSwitcher(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, error) { - tgtInfo, err := BuildTargets(ctx, s.ts, s.tmc, targetKeyspace, workflowName) - if err != nil { - log.Infof("Error building targets: %s", err) - return nil, err - } - targets, frozen, optCells, optTabletTypes := tgtInfo.Targets, tgtInfo.Frozen, tgtInfo.OptCells, tgtInfo.OptTabletTypes - - ts := &trafficSwitcher{ - ws: s, - logger: logutil.NewConsoleLogger(), - workflow: workflowName, - reverseWorkflow: ReverseWorkflowName(workflowName), - id: HashStreams(targetKeyspace, targets), - targets: targets, - sources: make(map[string]*MigrationSource), - targetKeyspace: targetKeyspace, - frozen: frozen, - optCells: optCells, - optTabletTypes: optTabletTypes, - workflowType: tgtInfo.WorkflowType, - workflowSubType: tgtInfo.WorkflowSubType, - } - log.Infof("Migration ID for workflow %s: %d", workflowName, ts.id) - sourceTopo := s.ts - - // Build the sources. 
- for _, target := range targets { - for _, bls := range target.Sources { - if ts.sourceKeyspace == "" { - ts.sourceKeyspace = bls.Keyspace - ts.sourceTimeZone = bls.SourceTimeZone - ts.targetTimeZone = bls.TargetTimeZone - ts.externalCluster = bls.ExternalCluster - if ts.externalCluster != "" { - externalTopo, err := s.ts.OpenExternalVitessClusterServer(ctx, ts.externalCluster) - if err != nil { - return nil, err - } - sourceTopo = externalTopo - ts.externalTopo = externalTopo - } - } else if ts.sourceKeyspace != bls.Keyspace { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "source keyspaces are mismatched across streams: %v vs %v", ts.sourceKeyspace, bls.Keyspace) - } - - if ts.tables == nil { - for _, rule := range bls.Filter.Rules { - ts.tables = append(ts.tables, rule.Match) - } - sort.Strings(ts.tables) - } else { - var tables []string - for _, rule := range bls.Filter.Rules { - tables = append(tables, rule.Match) - } - sort.Strings(tables) - if !reflect.DeepEqual(ts.tables, tables) { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "table lists are mismatched across streams: %v vs %v", ts.tables, tables) - } - } - - if _, ok := ts.sources[bls.Shard]; ok { - continue - } - sourcesi, err := sourceTopo.GetShard(ctx, bls.Keyspace, bls.Shard) - if err != nil { - return nil, err - } - if sourcesi.PrimaryAlias == nil { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "source shard %s/%s currently has no primary tablet", - bls.Keyspace, bls.Shard) - } - sourcePrimary, err := sourceTopo.GetTablet(ctx, sourcesi.PrimaryAlias) - if err != nil { - return nil, err - } - ts.sources[bls.Shard] = NewMigrationSource(sourcesi, sourcePrimary) - } - } - if ts.sourceKeyspace != ts.targetKeyspace || ts.externalCluster != "" { - ts.migrationType = binlogdatapb.MigrationType_TABLES - } else { - // TODO(sougou): for shard migration, validate that source and target combined - // keyranges match. 
- ts.migrationType = binlogdatapb.MigrationType_SHARDS - for sourceShard := range ts.sources { - if _, ok := ts.targets[sourceShard]; ok { - // If shards are overlapping, then this is a table migration. - ts.migrationType = binlogdatapb.MigrationType_TABLES - break - } - } - } - vs, err := sourceTopo.GetVSchema(ctx, ts.sourceKeyspace) - if err != nil { - return nil, err - } - ts.sourceKSSchema, err = vindexes.BuildKeyspaceSchema(vs, ts.sourceKeyspace) - if err != nil { - return nil, err - } - - sourceShards, targetShards := ts.getSourceAndTargetShardsNames() - - ts.isPartialMigration, err = ts.isPartialMoveTables(sourceShards, targetShards) - if err != nil { - return nil, err - } - if ts.isPartialMigration { - log.Infof("Migration is partial, for shards %+v", sourceShards) - } - return ts, nil -} - -func (s *Server) dropRelatedArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error { - if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil { - return err - } - if !keepRoutingRules { - if err := sw.deleteRoutingRules(ctx); err != nil { - return err - } - if err := sw.deleteShardRoutingRules(ctx); err != nil { - return err - } - } - - return nil -} - -// dropSources cleans up source tables, shards and denied tables after a -// MoveTables/Reshard is completed. 
-func (s *Server) dropSources(ctx context.Context, ts *trafficSwitcher, removalType TableRemovalType, keepData, keepRoutingRules, force, dryRun bool) (*[]string, error) { - var ( - sw iswitcher - err error - ) - if dryRun { - sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} - } else { - sw = &switcher{ts: ts, s: s} - } - var tctx context.Context - tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropSources") - if lockErr != nil { - ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr) - return nil, lockErr - } - defer sourceUnlock(&err) - ctx = tctx - if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { - tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropSources") - if lockErr != nil { - ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) - return nil, lockErr - } - defer targetUnlock(&err) - ctx = tctx - } - if !force { - if err := sw.validateWorkflowHasCompleted(ctx); err != nil { - ts.Logger().Errorf("Workflow has not completed, cannot DropSources: %v", err) - return nil, err - } - } - if !keepData { - switch ts.MigrationType() { - case binlogdatapb.MigrationType_TABLES: - log.Infof("Deleting tables") - if err := sw.removeSourceTables(ctx, removalType); err != nil { - return nil, err - } - if err := sw.dropSourceDeniedTables(ctx); err != nil { - return nil, err - } - if err := sw.dropTargetDeniedTables(ctx); err != nil { - return nil, err - } - - case binlogdatapb.MigrationType_SHARDS: - log.Infof("Removing shards") - if err := sw.dropSourceShards(ctx); err != nil { - return nil, err - } - } - } - if err := s.dropArtifacts(ctx, keepRoutingRules, sw); err != nil { - return nil, err - } - if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { - return nil, err - } - - return sw.logs(), nil -} - -func (s *Server) dropArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error { - if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil { - 
return err - } - if err := sw.dropTargetVReplicationStreams(ctx); err != nil { - return err - } - if !keepRoutingRules { - if err := sw.deleteRoutingRules(ctx); err != nil { - return err - } - if err := sw.deleteShardRoutingRules(ctx); err != nil { - return err - } - } - - return nil -} - -// DeleteShard will do all the necessary changes in the topology server -// to entirely remove a shard. -func (s *Server) DeleteShard(ctx context.Context, keyspace, shard string, recursive, evenIfServing bool) error { - // Read the Shard object. If it's not there, try to clean up - // the topology anyway. - shardInfo, err := s.ts.GetShard(ctx, keyspace, shard) - if err != nil { - if topo.IsErrType(err, topo.NoNode) { - log.Infof("Shard %v/%v doesn't seem to exist, cleaning up any potential leftover", keyspace, shard) - return s.ts.DeleteShard(ctx, keyspace, shard) - } - return err - } - - servingCells, err := s.ts.GetShardServingCells(ctx, shardInfo) - if err != nil { - return err - } - // Check the Serving map for the shard, we don't want to - // remove a serving shard if not absolutely sure. - if !evenIfServing && len(servingCells) > 0 { - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "shard %v/%v is still serving, cannot delete it, use the even-if-serving flag if needed", keyspace, shard) - } - - cells, err := s.ts.GetCellInfoNames(ctx) - if err != nil { - return err - } - - // Go through all the cells. - for _, cell := range cells { - var aliases []*topodatapb.TabletAlias - - // Get the ShardReplication object for that cell. Try - // to find all tablets that may belong to our shard. - sri, err := s.ts.GetShardReplication(ctx, cell, keyspace, shard) - switch { - case topo.IsErrType(err, topo.NoNode): - // No ShardReplication object. It means the - // topo is inconsistent. Let's read all the - // tablets for that cell, and if we find any - // in our keyspace / shard, either abort or - // try to delete them. 
- aliases, err = s.ts.GetTabletAliasesByCell(ctx, cell) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTabletsByCell(%v) failed: %v", cell, err) - } - case err == nil: - // We found a ShardReplication object. We - // trust it to have all tablet records. - aliases = make([]*topodatapb.TabletAlias, len(sri.Nodes)) - for i, n := range sri.Nodes { - aliases[i] = n.TabletAlias - } - default: - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err) - } - - // Get the corresponding Tablet records. Note - // GetTabletMap ignores ErrNoNode, and it's good for - // our purpose, it means a tablet was deleted but is - // still referenced. - tabletMap, err := s.ts.GetTabletMap(ctx, aliases) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTabletMap() failed: %v", err) - } - - // Remove the tablets that don't belong to our - // keyspace/shard from the map. - for a, ti := range tabletMap { - if ti.Keyspace != keyspace || ti.Shard != shard { - delete(tabletMap, a) - } - } - - // Now see if we need to DeleteTablet, and if we can, do it. - if len(tabletMap) > 0 { - if !recursive { - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "shard %v/%v still has %v tablets in cell %v; use --recursive or remove them manually", keyspace, shard, len(tabletMap), cell) - } - - log.Infof("Deleting all tablets in shard %v/%v cell %v", keyspace, shard, cell) - for tabletAlias, tabletInfo := range tabletMap { - // We don't care about scrapping or updating the replication graph, - // because we're about to delete the entire replication graph. - log.Infof("Deleting tablet %v", tabletAlias) - if err := s.ts.DeleteTablet(ctx, tabletInfo.Alias); err != nil && !topo.IsErrType(err, topo.NoNode) { - // We don't want to continue if a DeleteTablet fails for - // any good reason (other than missing tablet, in which - // case it's just a topology server inconsistency we can - // ignore). 
If we continue and delete the replication - // graph, the tablet record will be orphaned, since - // we'll no longer know it belongs to this shard. - // - // If the problem is temporary, or resolved externally, re-running - // DeleteShard will skip over tablets that were already deleted. - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "can't delete tablet %v: %v", tabletAlias, err) - } - } - } - } - - // Try to remove the replication graph and serving graph in each cell, - // regardless of its existence. - for _, cell := range cells { - if err := s.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && !topo.IsErrType(err, topo.NoNode) { - log.Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err) - } - } - - return s.ts.DeleteShard(ctx, keyspace, shard) -} - -// updateShardRecords updates the shard records based on 'from' or 'to' direction. -func (s *Server) updateShardRecords(ctx context.Context, keyspace string, shards []*topo.ShardInfo, cells []string, - servedType topodatapb.TabletType, isFrom bool, clearSourceShards bool, logger logutil.Logger) (err error) { - return topotools.UpdateShardRecords(ctx, s.ts, s.tmc, keyspace, shards, cells, servedType, isFrom, clearSourceShards, logger) -} - -// refreshPrimaryTablets will just RPC-ping all the primary tablets with RefreshState -func (s *Server) refreshPrimaryTablets(ctx context.Context, shards []*topo.ShardInfo) error { - wg := sync.WaitGroup{} - rec := concurrency.AllErrorRecorder{} - for _, si := range shards { - wg.Add(1) - go func(si *topo.ShardInfo) { - defer wg.Done() - ti, err := s.ts.GetTablet(ctx, si.PrimaryAlias) - if err != nil { - rec.RecordError(err) - return - } - - if err := s.tmc.RefreshState(ctx, ti.Tablet); err != nil { - rec.RecordError(err) - } else { - log.Infof("%v responded", topoproto.TabletAliasString(si.PrimaryAlias)) - } - }(si) - } - wg.Wait() - return rec.Error() -} - -// finalizeMigrateWorkflow deletes the streams for the 
Migrate workflow. -// We only cleanup the target for external sources. -func (s *Server) finalizeMigrateWorkflow(ctx context.Context, targetKeyspace, workflow, tableSpecs string, cancel, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { - ts, err := s.buildTrafficSwitcher(ctx, targetKeyspace, workflow) - if err != nil { - ts.Logger().Errorf("buildTrafficSwitcher failed: %v", err) - return nil, err - } - var sw iswitcher - if dryRun { - sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} - } else { - sw = &switcher{s: s, ts: ts} - } - var tctx context.Context - tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "completeMigrateWorkflow") - if lockErr != nil { - ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) - return nil, lockErr - } - defer targetUnlock(&err) - ctx = tctx - if err := sw.dropTargetVReplicationStreams(ctx); err != nil { - return nil, err - } - if !cancel { - if err := sw.addParticipatingTablesToKeyspace(ctx, targetKeyspace, tableSpecs); err != nil { - return nil, err - } - if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { - return nil, err - } - } - log.Infof("cancel is %t, keepData %t", cancel, keepData) - if cancel && !keepData { - if err := sw.removeTargetTables(ctx); err != nil { - return nil, err - } - } - return sw.logs(), nil -} - -// WorkflowSwitchTraffic switches traffic in the direction passed for specified tablet types. 
-func (s *Server) WorkflowSwitchTraffic(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest) (*vtctldatapb.WorkflowSwitchTrafficResponse, error) { - var ( - dryRunResults []string - rdDryRunResults, wrDryRunResults *[]string - hasReplica, hasRdonly, hasPrimary bool - ) - timeout, set, err := protoutil.DurationFromProto(req.Timeout) - if err != nil { - err = vterrors.Wrapf(err, "unable to parse Timeout into a valid duration") - return nil, err - } - if !set { - timeout = defaultDuration - } - ts, startState, err := s.getWorkflowState(ctx, req.Keyspace, req.Workflow) - if err != nil { - return nil, err - } - - if startState.WorkflowType == TypeMigrate { - return nil, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid action for Migrate workflow: SwitchTraffic") - } - - maxReplicationLagAllowed, set, err := protoutil.DurationFromProto(req.MaxReplicationLagAllowed) - if err != nil { - err = vterrors.Wrapf(err, "unable to parse MaxReplicationLagAllowed into a valid duration") - return nil, err - } - if !set { - maxReplicationLagAllowed = defaultDuration - } - direction := TrafficSwitchDirection(req.Direction) - if direction == DirectionBackward { - ts, startState, err = s.getWorkflowState(ctx, startState.SourceKeyspace, ts.reverseWorkflow) - if err != nil { - return nil, err - } - } - reason, err := s.canSwitch(ctx, ts, startState, direction, int64(maxReplicationLagAllowed.Seconds())) - if err != nil { - return nil, err - } - if reason != "" { - return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot switch traffic for workflow %s at this time: %s", startState.Workflow, reason) - } - hasReplica, hasRdonly, hasPrimary, err = parseTabletTypes(req.TabletTypes) - if err != nil { - return nil, err - } - if hasReplica || hasRdonly { - if rdDryRunResults, err = s.switchReads(ctx, req, ts, startState, timeout, false, direction); err != nil { - return nil, err - } - log.Infof("Switch Reads done for workflow %s.%s", req.Keyspace, req.Workflow) - 
} - if rdDryRunResults != nil { - dryRunResults = append(dryRunResults, *rdDryRunResults...) - } - if hasPrimary { - if _, wrDryRunResults, err = s.switchWrites(ctx, req, ts, timeout, false); err != nil { - return nil, err - } - log.Infof("Switch Writes done for workflow %s.%s", req.Keyspace, req.Workflow) - } - - if wrDryRunResults != nil { - dryRunResults = append(dryRunResults, *wrDryRunResults...) - } - if req.DryRun && len(dryRunResults) == 0 { - dryRunResults = append(dryRunResults, "No changes required") - } - cmd := "SwitchTraffic" - if direction == DirectionBackward { - cmd = "ReverseTraffic" - } - log.Infof("%s done for workflow %s.%s", cmd, req.Keyspace, req.Workflow) - resp := &vtctldatapb.WorkflowSwitchTrafficResponse{} - if req.DryRun { - resp.Summary = fmt.Sprintf("%s dry run results for workflow %s.%s at %v", cmd, req.Keyspace, req.Workflow, time.Now().UTC().Format(time.RFC822)) - resp.DryRunResults = dryRunResults - } else { - log.Infof("SwitchTraffic done for workflow %s.%s", req.Keyspace, req.Workflow) - resp.Summary = fmt.Sprintf("%s was successful for workflow %s.%s", cmd, req.Keyspace, req.Workflow) - // Reload the state after the SwitchTraffic operation - // and return that as a string. - keyspace := req.Keyspace - workflow := req.Workflow - if direction == DirectionBackward { - keyspace = startState.SourceKeyspace - workflow = ts.reverseWorkflow - } - resp.StartState = startState.String() - log.Infof("Before reloading workflow state after switching traffic: %+v\n", resp.StartState) - _, currentState, err := s.getWorkflowState(ctx, keyspace, workflow) - if err != nil { - resp.CurrentState = fmt.Sprintf("Error reloading workflow state after switching traffic: %v", err) - } else { - resp.CurrentState = currentState.String() - } - } - return resp, nil -} - -// switchReads is a generic way of switching read traffic for a workflow. 
-func (s *Server) switchReads(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest, ts *trafficSwitcher, state *State, timeout time.Duration, cancel bool, direction TrafficSwitchDirection) (*[]string, error) { - roTypesToSwitchStr := topoproto.MakeStringTypeCSV(req.TabletTypes) - var switchReplica, switchRdonly bool - for _, roType := range req.TabletTypes { - switch roType { - case topodatapb.TabletType_REPLICA: - switchReplica = true - case topodatapb.TabletType_RDONLY: - switchRdonly = true - } - } - - // Consistently handle errors by logging and returning them. - handleError := func(message string, err error) (*[]string, error) { - werr := vterrors.Errorf(vtrpcpb.Code_INTERNAL, fmt.Sprintf("%s: %v", message, err)) - ts.Logger().Error(werr) - return nil, werr - } - - log.Infof("Switching reads: %s.%s tablet types: %s, cells: %s, workflow state: %s", ts.targetKeyspace, ts.workflow, roTypesToSwitchStr, ts.optCells, state.String()) - if !switchReplica && !switchRdonly { - return handleError("invalid tablet types", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "tablet types must be REPLICA or RDONLY: %s", roTypesToSwitchStr)) - } - if !ts.isPartialMigration { // shard level traffic switching is all or nothing - if direction == DirectionBackward && switchReplica && len(state.ReplicaCellsSwitched) == 0 { - return handleError("invalid request", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "requesting reversal of read traffic for REPLICAs but REPLICA reads have not been switched")) - } - if direction == DirectionBackward && switchRdonly && len(state.RdonlyCellsSwitched) == 0 { - return handleError("invalid request", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "requesting reversal of SwitchReads for RDONLYs but RDONLY reads have not been switched")) - } - } - var cells = req.Cells - // If no cells were provided in the command then use the value from the workflow. 
- if len(cells) == 0 && ts.optCells != "" { - cells = strings.Split(strings.TrimSpace(ts.optCells), ",") - } - - // If there are no rdonly tablets in the cells ask to switch rdonly tablets as well so that routing rules - // are updated for rdonly as well. Otherwise vitess will not know that the workflow has completed and will - // incorrectly report that not all reads have been switched. User currently is forced to switch non-existent - // rdonly tablets. - if switchReplica && !switchRdonly { - var err error - rdonlyTabletsExist, err := topotools.DoCellsHaveRdonlyTablets(ctx, s.ts, cells) - if err != nil { - return nil, err - } - if !rdonlyTabletsExist { - req.TabletTypes = append(req.TabletTypes, topodatapb.TabletType_RDONLY) - } - } - - // If journals exist notify user and fail. - journalsExist, _, err := ts.checkJournals(ctx) - if err != nil { - return handleError(fmt.Sprintf("failed to read journal in the %s keyspace", ts.SourceKeyspaceName()), err) - } - if journalsExist { - log.Infof("Found a previous journal entry for %d", ts.id) - } - var sw iswitcher - if req.DryRun { - sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} - } else { - sw = &switcher{ts: ts, s: s} - } - - if err := ts.validate(ctx); err != nil { - return handleError("workflow validation failed", err) - } - - // For reads, locking the source keyspace is sufficient. 
- ctx, unlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchReads") - if lockErr != nil { - return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.SourceKeyspaceName()), lockErr) - } - defer unlock(&err) - - if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { - if ts.isPartialMigration { - ts.Logger().Infof("Partial migration, skipping switchTableReads as traffic is all or nothing per shard and overridden for reads AND writes in the ShardRoutingRule created when switching writes.") - } else if err := sw.switchTableReads(ctx, cells, req.TabletTypes, direction); err != nil { - return handleError("failed to switch read traffic for the tables", err) - } - return sw.logs(), nil - } - ts.Logger().Infof("About to switchShardReads: %+v, %+s, %+v", cells, roTypesToSwitchStr, direction) - if err := sw.switchShardReads(ctx, cells, req.TabletTypes, direction); err != nil { - return handleError("failed to switch read traffic for the shards", err) - } - - ts.Logger().Infof("switchShardReads Completed: %+v, %+s, %+v", cells, roTypesToSwitchStr, direction) - if err := s.ts.ValidateSrvKeyspace(ctx, ts.targetKeyspace, strings.Join(cells, ",")); err != nil { - err2 := vterrors.Wrapf(err, "after switching shard reads, found SrvKeyspace for %s is corrupt in cell %s", - ts.targetKeyspace, strings.Join(cells, ",")) - return handleError("failed to validate SrvKeyspace record", err2) - } - return sw.logs(), nil -} - -// switchWrites is a generic way of migrating write traffic for a workflow. -func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwitchTrafficRequest, ts *trafficSwitcher, timeout time.Duration, - cancel bool) (journalID int64, dryRunResults *[]string, err error) { - - var sw iswitcher - if req.DryRun { - sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} - } else { - sw = &switcher{ts: ts, s: s} - } - - // Consistently handle errors by logging and returning them. 
- handleError := func(message string, err error) (int64, *[]string, error) { - werr := vterrors.Errorf(vtrpcpb.Code_INTERNAL, fmt.Sprintf("%s: %v", message, err)) - ts.Logger().Error(werr) - return 0, nil, werr - } - - if ts.frozen { - ts.Logger().Warningf("Writes have already been switched for workflow %s, nothing to do here", ts.WorkflowName()) - return 0, sw.logs(), nil - } - - if err := ts.validate(ctx); err != nil { - return handleError("workflow validation failed", err) - } - - if req.EnableReverseReplication { - if err := areTabletsAvailableToStreamFrom(ctx, req, ts, ts.TargetKeyspaceName(), ts.TargetShards()); err != nil { - return handleError(fmt.Sprintf("no tablets were available to stream from in the %s keyspace", ts.SourceKeyspaceName()), err) - } - } - - // Need to lock both source and target keyspaces. - tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchWrites") - if lockErr != nil { - return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.SourceKeyspaceName()), lockErr) - } - ctx = tctx - defer sourceUnlock(&err) - if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { - tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "SwitchWrites") - if lockErr != nil { - return handleError(fmt.Sprintf("failed to lock the %s keyspace", ts.TargetKeyspaceName()), lockErr) - } - ctx = tctx - defer targetUnlock(&err) - } - - // Find out if the target is using any sequence tables for auto_increment - // value generation. If so, then we'll need to ensure that they are - // initialized properly before allowing new writes on the target. - sequenceMetadata := make(map[string]*sequenceMetadata) - // For sharded to sharded migrations the sequence must already be setup. - // For reshards the sequence usage is not changed. 
- if req.InitializeTargetSequences && ts.workflowType == binlogdatapb.VReplicationWorkflowType_MoveTables && - ts.SourceKeyspaceSchema() != nil && ts.SourceKeyspaceSchema().Keyspace != nil && - !ts.SourceKeyspaceSchema().Keyspace.Sharded { - sequenceMetadata, err = ts.getTargetSequenceMetadata(ctx) - if err != nil { - return handleError(fmt.Sprintf("failed to get the sequence information in the %s keyspace", ts.TargetKeyspaceName()), err) - } - } - - // If no journals exist, sourceWorkflows will be initialized by sm.MigrateStreams. - journalsExist, sourceWorkflows, err := ts.checkJournals(ctx) - if err != nil { - return handleError(fmt.Sprintf("failed to read journal in the %s keyspace", ts.SourceKeyspaceName()), err) - } - if !journalsExist { - ts.Logger().Infof("No previous journals were found. Proceeding normally.") - sm, err := BuildStreamMigrator(ctx, ts, cancel) - if err != nil { - return handleError("failed to migrate the workflow streams", err) - } - if cancel { - sw.cancelMigration(ctx, sm) - return 0, sw.logs(), nil - } - - ts.Logger().Infof("Stopping streams") - sourceWorkflows, err = sw.stopStreams(ctx, sm) - if err != nil { - for key, streams := range sm.Streams() { - for _, stream := range streams { - ts.Logger().Errorf("stream in stopStreams: key %s shard %s stream %+v", key, stream.BinlogSource.Shard, stream.BinlogSource) - } - } - sw.cancelMigration(ctx, sm) - return handleError("failed to stop the workflow streams", err) - } - - ts.Logger().Infof("Stopping source writes") - if err := sw.stopSourceWrites(ctx); err != nil { - sw.cancelMigration(ctx, sm) - return handleError(fmt.Sprintf("failed to stop writes in the %s keyspace", ts.SourceKeyspaceName()), err) - } - - if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { - ts.Logger().Infof("Executing LOCK TABLES on source tables %d times", lockTablesCycles) - // Doing this twice with a pause in-between to catch any writes that may have raced in between - // the tablet's deny list check and 
the first mysqld side table lock. - for cnt := 1; cnt <= lockTablesCycles; cnt++ { - if err := ts.executeLockTablesOnSource(ctx); err != nil { - sw.cancelMigration(ctx, sm) - return handleError(fmt.Sprintf("failed to execute LOCK TABLES (attempt %d of %d) on sources", cnt, lockTablesCycles), err) - } - // No need to UNLOCK the tables as the connection was closed once the locks were acquired - // and thus the locks released. - time.Sleep(lockTablesCycleDelay) - } - } - - ts.Logger().Infof("Waiting for streams to catchup") - if err := sw.waitForCatchup(ctx, timeout); err != nil { - sw.cancelMigration(ctx, sm) - return handleError("failed to sync up replication between the source and target", err) - } - - ts.Logger().Infof("Migrating streams") - if err := sw.migrateStreams(ctx, sm); err != nil { - sw.cancelMigration(ctx, sm) - return handleError("failed to migrate the workflow streams", err) - } - - ts.Logger().Infof("Resetting sequences") - if err := sw.resetSequences(ctx); err != nil { - sw.cancelMigration(ctx, sm) - return handleError("failed to reset the sequences", err) - } - - ts.Logger().Infof("Creating reverse streams") - if err := sw.createReverseVReplication(ctx); err != nil { - sw.cancelMigration(ctx, sm) - return handleError("failed to create the reverse vreplication streams", err) - } - } else { - if cancel { - return handleError("invalid cancel", vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "traffic switching has reached the point of no return, cannot cancel")) - } - ts.Logger().Infof("Journals were found. Completing the left over steps.") - // Need to gather positions in case all journals were not created. - if err := ts.gatherPositions(ctx); err != nil { - return handleError("failed to gather replication positions", err) - } - } - - // This is the point of no return. Once a journal is created, - // traffic can be redirected to target shards. 
- if err := sw.createJournals(ctx, sourceWorkflows); err != nil { - return handleError("failed to create the journal", err) - } - // Initialize any target sequences, if there are any, before allowing new writes. - if req.InitializeTargetSequences && len(sequenceMetadata) > 0 { - // Writes are blocked so we can safely initialize the sequence tables but - // we also want to use a shorter timeout than the parent context. - // We use up at most half of the overall timeout. - initSeqCtx, cancel := context.WithTimeout(ctx, timeout/2) - defer cancel() - if err := sw.initializeTargetSequences(initSeqCtx, sequenceMetadata); err != nil { - return handleError(fmt.Sprintf("failed to initialize the sequences used in the %s keyspace", ts.TargetKeyspaceName()), err) - } - } - if err := sw.allowTargetWrites(ctx); err != nil { - return handleError(fmt.Sprintf("failed to allow writes in the %s keyspace", ts.TargetKeyspaceName()), err) - } - if err := sw.changeRouting(ctx); err != nil { - return handleError("failed to update the routing rules", err) - } - if err := sw.streamMigraterfinalize(ctx, ts, sourceWorkflows); err != nil { - return handleError("failed to finalize the traffic switch", err) - } - if req.EnableReverseReplication { - if err := sw.startReverseVReplication(ctx); err != nil { - return handleError("failed to start the reverse workflow", err) - } - } - - if err := sw.freezeTargetVReplication(ctx); err != nil { - return handleError(fmt.Sprintf("failed to freeze the workflow in the %s keyspace", ts.TargetKeyspaceName()), err) - } - - return ts.id, sw.logs(), nil -} - -func (s *Server) canSwitch(ctx context.Context, ts *trafficSwitcher, state *State, direction TrafficSwitchDirection, maxAllowedReplLagSecs int64) (reason string, err error) { - if direction == DirectionForward && state.WritesSwitched || - direction == DirectionBackward && !state.WritesSwitched { - log.Infof("writes already switched no need to check lag") - return "", nil - } - wf, err := s.GetWorkflow(ctx, 
state.TargetKeyspace, state.Workflow) - if err != nil { - return "", err - } - for _, stream := range wf.ShardStreams { - for _, st := range stream.GetStreams() { - if st.Message == Frozen { - return cannotSwitchFrozen, nil - } - // If no new events have been replicated after the copy phase then it will be 0. - if vreplLag := time.Now().Unix() - st.TimeUpdated.Seconds; vreplLag > maxAllowedReplLagSecs { - return fmt.Sprintf(cannotSwitchHighLag, vreplLag, maxAllowedReplLagSecs), nil - } - switch st.State { - case binlogdatapb.VReplicationWorkflowState_Copying.String(): - return cannotSwitchCopyIncomplete, nil - case binlogdatapb.VReplicationWorkflowState_Error.String(): - return cannotSwitchError, nil - } - } - } - - // Ensure that the tablets on both sides are in good shape as we make this same call in the - // process and an error will cause us to backout. - refreshErrors := strings.Builder{} - var m sync.Mutex - var wg sync.WaitGroup - rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) - defer cancel() - refreshTablets := func(shards []*topo.ShardInfo, stype string) { - defer wg.Done() - for _, si := range shards { - if partial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, s.ts, s.tmc, si, nil, ts.Logger()); err != nil || partial { - m.Lock() - refreshErrors.WriteString(fmt.Sprintf("failed to successfully refresh all tablets in the %s/%s %s shard (%v):\n %v\n", - si.Keyspace(), si.ShardName(), stype, err, partialDetails)) - m.Unlock() - } - } - } - wg.Add(1) - go refreshTablets(ts.SourceShards(), "source") - wg.Add(1) - go refreshTablets(ts.TargetShards(), "target") - wg.Wait() - if refreshErrors.Len() > 0 { - return fmt.Sprintf(cannotSwitchFailedTabletRefresh, refreshErrors.String()), nil - } - return "", nil -} - -// VReplicationExec executes a query remotely using the DBA pool. 
-func (s *Server) VReplicationExec(ctx context.Context, tabletAlias *topodatapb.TabletAlias, query string) (*querypb.QueryResult, error) { - ti, err := s.ts.GetTablet(ctx, tabletAlias) - if err != nil { - return nil, err - } - return s.tmc.VReplicationExec(ctx, ti.Tablet, query) -} - -// CopySchemaShard copies the schema from a source tablet to the -// specified shard. The schema is applied directly on the primary of -// the destination shard, and is propagated to the replicas through -// binlogs. -func (s *Server) CopySchemaShard(ctx context.Context, sourceTabletAlias *topodatapb.TabletAlias, tables, excludeTables []string, includeViews bool, destKeyspace, destShard string, waitReplicasTimeout time.Duration, skipVerify bool) error { - destShardInfo, err := s.ts.GetShard(ctx, destKeyspace, destShard) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetShard(%v, %v) failed: %v", destKeyspace, destShard, err) - } - - if destShardInfo.PrimaryAlias == nil { - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no primary in shard record %v/%v. Consider running 'vtctl InitShardPrimary' in case of a new shard or reparenting the shard to fix the topology data", destKeyspace, destShard) - } - - diffs, err := schematools.CompareSchemas(ctx, s.ts, s.tmc, sourceTabletAlias, destShardInfo.PrimaryAlias, tables, excludeTables, includeViews) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard failed because schemas could not be compared initially: %v", err) - } - if diffs == nil { - // Return early because dest has already the same schema as source. 
- return nil - } - - req := &tabletmanagerdatapb.GetSchemaRequest{Tables: tables, ExcludeTables: excludeTables, IncludeViews: includeViews} - sourceSd, err := schematools.GetSchema(ctx, s.ts, s.tmc, sourceTabletAlias, req) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetSchema(%v, %v, %v, %v) failed: %v", sourceTabletAlias, tables, excludeTables, includeViews, err) - } - - createSQLstmts := tmutils.SchemaDefinitionToSQLStrings(sourceSd) - - destTabletInfo, err := s.ts.GetTablet(ctx, destShardInfo.PrimaryAlias) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "GetTablet(%v) failed: %v", destShardInfo.PrimaryAlias, err) - } - for _, createSQL := range createSQLstmts { - err = s.applySQLShard(ctx, destTabletInfo, createSQL) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "creating a table failed."+ - " Most likely some tables already exist on the destination and differ from the source."+ - " Please remove all to be copied tables from the destination manually and run this command again."+ - " Full error: %v", err) - } - } - - // Remember the replication position after all the above were applied. - destPrimaryPos, err := s.tmc.PrimaryPosition(ctx, destTabletInfo.Tablet) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard: can't get replication position after schema applied: %v", err) - } - - // Although the copy was successful, we have to verify it to catch the case - // where the database already existed on the destination, but with different - // options e.g. a different character set. - // In that case, MySQL would have skipped our CREATE DATABASE IF NOT EXISTS - // statement. 
- if !skipVerify { - diffs, err = schematools.CompareSchemas(ctx, s.ts, s.tmc, sourceTabletAlias, destShardInfo.PrimaryAlias, tables, excludeTables, includeViews) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard failed because schemas could not be compared finally: %v", err) - } - if diffs != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard was not successful because the schemas between the two tablets %v and %v differ: %v", sourceTabletAlias, destShardInfo.PrimaryAlias, diffs) - } - } - - // Notify Replicas to reload schema. This is best-effort. - reloadCtx, cancel := context.WithTimeout(ctx, waitReplicasTimeout) - defer cancel() - _, ok := schematools.ReloadShard(reloadCtx, s.ts, s.tmc, logutil.NewMemoryLogger(), destKeyspace, destShard, destPrimaryPos, nil, true) - if !ok { - log.Error(vterrors.Errorf(vtrpcpb.Code_INTERNAL, "CopySchemaShard: failed to reload schema on all replicas")) - } - - return err -} - -// applySQLShard applies a given SQL change on a given tablet alias. It allows executing arbitrary -// SQL statements, but doesn't return any results, so it's only useful for SQL statements -// that would be run for their effects (e.g., CREATE). -// It works by applying the SQL statement on the shard's primary tablet with replication turned on. -// Thus it should be used only for changes that can be applied on a live instance without causing issues; -// it shouldn't be used for anything that will require a pivot. -// The SQL statement string is expected to have {{.DatabaseName}} in place of the actual db name. 
-func (s *Server) applySQLShard(ctx context.Context, tabletInfo *topo.TabletInfo, change string) error { - filledChange, err := fillStringTemplate(change, map[string]string{"DatabaseName": tabletInfo.DbName()}) - if err != nil { - return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "fillStringTemplate failed: %v", err) - } - ctx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - // Need to make sure that replication is enabled since we're only applying the statement on primaries - _, err = s.tmc.ApplySchema(ctx, tabletInfo.Tablet, &tmutils.SchemaChange{ - SQL: filledChange, - Force: false, - AllowReplication: true, - SQLMode: vreplication.SQLMode, - }) - return err -} - -// fillStringTemplate returns the string template filled -func fillStringTemplate(tmpl string, vars any) (string, error) { - myTemplate := template.Must(template.New("").Parse(tmpl)) - data := new(bytes.Buffer) - if err := myTemplate.Execute(data, vars); err != nil { - return "", err - } - return data.String(), nil -} ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) diff --git a/go/vt/vttablet/tabletmanager/framework_test.go b/go/vt/vttablet/tabletmanager/framework_test.go deleted file mode 100644 index 4734ab9ee96..00000000000 --- a/go/vt/vttablet/tabletmanager/framework_test.go +++ /dev/null @@ -1,492 +0,0 @@ -/* -Copyright 2023 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package tabletmanager - -import ( - "context" - "fmt" - "regexp" - "strings" - "sync" - "testing" - - "github.com/stretchr/testify/require" - - "vitess.io/vitess/go/mysql/fakesqldb" - "vitess.io/vitess/go/mysql/replication" - "vitess.io/vitess/go/sqltypes" - "vitess.io/vitess/go/vt/binlog/binlogplayer" - "vitess.io/vitess/go/vt/dbconfigs" - "vitess.io/vitess/go/vt/grpcclient" - "vitess.io/vitess/go/vt/mysqlctl" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vttablet/queryservice" - "vitess.io/vitess/go/vt/vttablet/tabletconn" - "vitess.io/vitess/go/vt/vttablet/tabletconntest" - "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" - "vitess.io/vitess/go/vt/vttablet/tmclient" - "vitess.io/vitess/go/vt/vttablet/tmclienttest" - - binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" - querypb "vitess.io/vitess/go/vt/proto/query" - tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -const ( - gtidFlavor = "MySQL56" - gtidPosition = "16b1039f-22b6-11ed-b765-0a43f95f28a3:1-220" -) - -func init() { - tabletconn.RegisterDialer("grpc", func(tablet *topodatapb.Tablet, failFast grpcclient.FailFast) (queryservice.QueryService, error) { - return &tabletconntest.FakeQueryService{ - StreamHealthResponse: &querypb.StreamHealthResponse{ - Serving: true, - Target: &querypb.Target{ - Keyspace: tablet.Keyspace, - Shard: tablet.Shard, - TabletType: tablet.Type, - Cell: tablet.Alias.Cell, - }, - RealtimeStats: &querypb.RealtimeStats{}, - }, - }, nil - }) -} - -type testEnv struct { - mu sync.Mutex - ctx context.Context - ts *topo.Server - cells []string - mysqld *mysqlctl.FakeMysqlDaemon - tmc *fakeTMClient - dbName string - protoName string -} - -func newTestEnv(t *testing.T, ctx context.Context, sourceKeyspace string, sourceShards []string) *testEnv { - tenv := &testEnv{ - ctx: context.Background(), - tmc: newFakeTMClient(), - cells: 
[]string{"zone1"}, - dbName: "tmtestdb", - protoName: t.Name(), - } - tenv.mu.Lock() - defer tenv.mu.Unlock() - tenv.ts = memorytopo.NewServer(ctx, tenv.cells...) - tenv.tmc.sourceKeyspace = sourceKeyspace - tenv.tmc.sourceShards = sourceShards - tenv.tmc.schema = defaultSchema - - tabletconn.RegisterDialer(t.Name(), func(tablet *topodatapb.Tablet, failFast grpcclient.FailFast) (queryservice.QueryService, error) { - tenv.mu.Lock() - defer tenv.mu.Unlock() - if qs, ok := tenv.tmc.tablets[int(tablet.Alias.Uid)]; ok { - return qs, nil - } - return nil, fmt.Errorf("tablet %d not found", tablet.Alias.Uid) - }) - tabletconntest.SetProtocol(fmt.Sprintf("go.vt.vttablet.tabletmanager.framework_test_%s", t.Name()), tenv.protoName) - tmclient.RegisterTabletManagerClientFactory(t.Name(), func() tmclient.TabletManagerClient { - return tenv.tmc - }) - tmclienttest.SetProtocol(fmt.Sprintf("go.vt.vttablet.tabletmanager.framework_test_%s", t.Name()), tenv.protoName) - - tenv.mysqld = mysqlctl.NewFakeMysqlDaemon(fakesqldb.New(t)) - var err error - tenv.mysqld.CurrentPrimaryPosition, err = replication.ParsePosition(gtidFlavor, gtidPosition) - require.NoError(t, err) - - return tenv -} - -func (tenv *testEnv) close() { - tenv.mu.Lock() - defer tenv.mu.Unlock() - tenv.ts.Close() - tenv.mysqld.Close() -} - -//-------------------------------------- -// Tablets - -func (tenv *testEnv) addTablet(t *testing.T, id int, keyspace, shard string) *fakeTabletConn { - tenv.mu.Lock() - defer tenv.mu.Unlock() - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: tenv.cells[0], - Uid: uint32(id), - }, - Keyspace: keyspace, - Shard: shard, - Type: topodatapb.TabletType_PRIMARY, - PortMap: map[string]int32{ - tenv.protoName: int32(id), - }, - } - if err := tenv.ts.InitTablet(tenv.ctx, tablet, false /* allowPrimaryOverride */, true /* createShardAndKeyspace */, false /* allowUpdate */); err != nil { - panic(err) - } - if _, err := tenv.ts.UpdateShardFields(tenv.ctx, keyspace, shard, 
func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - si.IsPrimaryServing = true - return nil - }); err != nil { - panic(err) - } - if err := tenv.ts.EnsureVSchema(tenv.ctx, keyspace); err != nil { - panic(err) - } - - vrdbClient := binlogplayer.NewMockDBClient(t) - vrdbClient.Tag = fmt.Sprintf("tablet:%d", id) - tenv.tmc.tablets[id] = &fakeTabletConn{ - tablet: tablet, - vrdbClient: vrdbClient, - } - - dbClientFactory := func() binlogplayer.DBClient { - return tenv.tmc.tablets[id].vrdbClient - } - tenv.tmc.tablets[id].vrengine = vreplication.NewTestEngine(tenv.ts, tenv.cells[0], tenv.mysqld, dbClientFactory, dbClientFactory, tenv.dbName, nil) - tenv.tmc.tablets[id].vrdbClient.ExpectRequest(fmt.Sprintf("select * from _vt.vreplication where db_name='%s'", tenv.dbName), &sqltypes.Result{}, nil) - tenv.tmc.tablets[id].vrengine.Open(tenv.ctx) - require.True(t, tenv.tmc.tablets[id].vrengine.IsOpen(), "vreplication engine was not open") - - tenv.tmc.tablets[id].tm = &TabletManager{ - VREngine: tenv.tmc.tablets[id].vrengine, - DBConfigs: &dbconfigs.DBConfigs{ - DBName: tenv.dbName, - }, - } - - return tenv.tmc.tablets[id] -} - -func (tenv *testEnv) deleteTablet(tablet *topodatapb.Tablet) { - tenv.mu.Lock() - defer tenv.mu.Unlock() - tenv.tmc.tablets[int(tablet.Alias.Uid)].vrdbClient.Close() - tenv.tmc.tablets[int(tablet.Alias.Uid)].vrengine.Close() - tenv.ts.DeleteTablet(tenv.ctx, tablet.Alias) - // This is not automatically removed from shard replication, which results in log spam. - topo.DeleteTabletReplicationData(tenv.ctx, tenv.ts, tablet) -} - -// fakeTabletConn implements the TabletConn and QueryService interfaces. -type fakeTabletConn struct { - queryservice.QueryService - tablet *topodatapb.Tablet - tm *TabletManager - vrdbClient *binlogplayer.MockDBClient - vrengine *vreplication.Engine -} - -// fakeTabletConn implements the QueryService interface. 
-func (ftc *fakeTabletConn) Begin(ctx context.Context, target *querypb.Target, options *querypb.ExecuteOptions) (queryservice.TransactionState, error) { - return queryservice.TransactionState{ - TransactionID: 1, - }, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) Commit(ctx context.Context, target *querypb.Target, transactionID int64) (int64, error) { - return 0, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) Rollback(ctx context.Context, target *querypb.Target, transactionID int64) (int64, error) { - return 0, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) Prepare(ctx context.Context, target *querypb.Target, transactionID int64, dtid string) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) CommitPrepared(ctx context.Context, target *querypb.Target, dtid string) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) RollbackPrepared(ctx context.Context, target *querypb.Target, dtid string, originalID int64) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) CreateTransaction(ctx context.Context, target *querypb.Target, dtid string, participants []*querypb.Target) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) StartCommit(ctx context.Context, target *querypb.Target, transactionID int64, dtid string) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) SetRollback(ctx context.Context, target *querypb.Target, dtid string, transactionID int64) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. 
-func (ftc *fakeTabletConn) ConcludeTransaction(ctx context.Context, target *querypb.Target, dtid string) (err error) { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) ReadTransaction(ctx context.Context, target *querypb.Target, dtid string) (metadata *querypb.TransactionMetadata, err error) { - return nil, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) Execute(ctx context.Context, target *querypb.Target, sql string, bindVariables map[string]*querypb.BindVariable, transactionID, reservedID int64, options *querypb.ExecuteOptions) (*sqltypes.Result, error) { - return nil, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) StreamExecute(ctx context.Context, target *querypb.Target, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, reservedID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) BeginExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, reservedID int64, options *querypb.ExecuteOptions) (queryservice.TransactionState, *sqltypes.Result, error) { - return queryservice.TransactionState{ - TransactionID: 1, - }, nil, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) BeginStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, reservedID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.TransactionState, error) { - return queryservice.TransactionState{ - TransactionID: 1, - }, nil -} - -// fakeTabletConn implements the QueryService interface. 
-func (ftc *fakeTabletConn) MessageStream(ctx context.Context, target *querypb.Target, name string, callback func(*sqltypes.Result) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) MessageAck(ctx context.Context, target *querypb.Target, name string, ids []*querypb.Value) (count int64, err error) { - return 0, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) VStream(ctx context.Context, request *binlogdatapb.VStreamRequest, send func([]*binlogdatapb.VEvent) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) VStreamRows(ctx context.Context, request *binlogdatapb.VStreamRowsRequest, send func(*binlogdatapb.VStreamRowsResponse) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) VStreamResults(ctx context.Context, target *querypb.Target, query string, send func(*binlogdatapb.VStreamResultsResponse) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) HandlePanic(err *error) { -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) ReserveBeginExecute(ctx context.Context, target *querypb.Target, preQueries []string, postBeginQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, options *querypb.ExecuteOptions) (queryservice.ReservedTransactionState, *sqltypes.Result, error) { - return queryservice.ReservedTransactionState{ - ReservedID: 1, - }, nil, nil -} - -// fakeTabletConn implements the QueryService interface. 
-func (ftc *fakeTabletConn) ReserveBeginStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, postBeginQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.ReservedTransactionState, error) { - return queryservice.ReservedTransactionState{ - ReservedID: 1, - }, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) ReserveExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, options *querypb.ExecuteOptions) (queryservice.ReservedState, *sqltypes.Result, error) { - return queryservice.ReservedState{ - ReservedID: 1, - }, nil, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) ReserveStreamExecute(ctx context.Context, target *querypb.Target, preQueries []string, sql string, bindVariables map[string]*querypb.BindVariable, transactionID int64, options *querypb.ExecuteOptions, callback func(*sqltypes.Result) error) (queryservice.ReservedState, error) { - return queryservice.ReservedState{ - ReservedID: 1, - }, nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) Release(ctx context.Context, target *querypb.Target, transactionID, reservedID int64) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. -func (ftc *fakeTabletConn) GetSchema(ctx context.Context, target *querypb.Target, tableType querypb.SchemaTableType, tableNames []string, callback func(schemaRes *querypb.GetSchemaResponse) error) error { - return nil -} - -// fakeTabletConn implements the QueryService interface. 
-func (ftc *fakeTabletConn) Close(ctx context.Context) error { - return nil -} - -func (ftc *fakeTabletConn) StreamHealth(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error { - return callback(&querypb.StreamHealthResponse{ - Serving: true, - Target: &querypb.Target{ - Keyspace: ftc.tablet.Keyspace, - Shard: ftc.tablet.Shard, - TabletType: ftc.tablet.Type, - Cell: ftc.tablet.Alias.Cell, - }, - RealtimeStats: &querypb.RealtimeStats{}, - }) -} - -//---------------------------------------------- -// fakeTMClient - -type fakeTMClient struct { - tmclient.TabletManagerClient - sourceKeyspace string - sourceShards []string - tablets map[int]*fakeTabletConn - schema *tabletmanagerdatapb.SchemaDefinition - vreQueries map[int]map[string]*querypb.QueryResult -} - -func newFakeTMClient() *fakeTMClient { - return &fakeTMClient{ - tablets: make(map[int]*fakeTabletConn), - vreQueries: make(map[int]map[string]*querypb.QueryResult), - schema: &tabletmanagerdatapb.SchemaDefinition{}, - } -} - -func (tmc *fakeTMClient) GetSchema(ctx context.Context, tablet *topodatapb.Tablet, request *tabletmanagerdatapb.GetSchemaRequest) (*tabletmanagerdatapb.SchemaDefinition, error) { - return tmc.schema, nil -} - -func (tmc *fakeTMClient) SetSchema(schema *tabletmanagerdatapb.SchemaDefinition) { - tmc.schema = schema -} - -func (tmc *fakeTMClient) ExecuteFetchAsApp(ctx context.Context, tablet *topodatapb.Tablet, usePool bool, req *tabletmanagerdatapb.ExecuteFetchAsAppRequest) (*querypb.QueryResult, error) { - // Reuse VReplicationExec - return tmc.VReplicationExec(ctx, tablet, string(req.Query)) -} - -func (tmc *fakeTMClient) ExecuteFetchAsDba(ctx context.Context, tablet *topodatapb.Tablet, usePool bool, req *tabletmanagerdatapb.ExecuteFetchAsDbaRequest) (*querypb.QueryResult, error) { - // Reuse VReplicationExec - return tmc.VReplicationExec(ctx, tablet, string(req.Query)) -} - -// setVReplicationExecResults allows you to specify VReplicationExec queries -// and their 
results. You can specify exact strings or strings prefixed with -// a '/', in which case they will be treated as a valid regexp. -func (tmc *fakeTMClient) setVReplicationExecResults(tablet *topodatapb.Tablet, query string, result *sqltypes.Result) { - queries, ok := tmc.vreQueries[int(tablet.Alias.Uid)] - if !ok { - queries = make(map[string]*querypb.QueryResult) - tmc.vreQueries[int(tablet.Alias.Uid)] = queries - } - queries[query] = sqltypes.ResultToProto3(result) -} - -func (tmc *fakeTMClient) VReplicationExec(ctx context.Context, tablet *topodatapb.Tablet, query string) (*querypb.QueryResult, error) { - if result, ok := tmc.vreQueries[int(tablet.Alias.Uid)][query]; ok { - return result, nil - } - for qry, res := range tmc.vreQueries[int(tablet.Alias.Uid)] { - if strings.HasPrefix(qry, "/") { - re := regexp.MustCompile(qry) - if re.MatchString(qry) { - return res, nil - } - } - } - return nil, fmt.Errorf("query %q not found for tablet %d", query, tablet.Alias.Uid) -} - -func (tmc *fakeTMClient) CreateVReplicationWorkflow(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.CreateVReplicationWorkflowRequest) (*tabletmanagerdatapb.CreateVReplicationWorkflowResponse, error) { - return tmc.tablets[int(tablet.Alias.Uid)].tm.CreateVReplicationWorkflow(ctx, req) -} - -func (tmc *fakeTMClient) ReadVReplicationWorkflow(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.ReadVReplicationWorkflowRequest) (*tabletmanagerdatapb.ReadVReplicationWorkflowResponse, error) { - resp := &tabletmanagerdatapb.ReadVReplicationWorkflowResponse{ - Workflow: req.Workflow, - WorkflowSubType: binlogdatapb.VReplicationWorkflowSubType_None, - WorkflowType: binlogdatapb.VReplicationWorkflowType_MoveTables, - TabletTypes: []topodatapb.TabletType{topodatapb.TabletType_PRIMARY}, - Streams: make([]*tabletmanagerdatapb.ReadVReplicationWorkflowResponse_Stream, len(tmc.sourceShards)), - } - rules := make([]*binlogdatapb.Rule, 
len(defaultSchema.TableDefinitions)) - for i, table := range defaultSchema.TableDefinitions { - rules[i] = &binlogdatapb.Rule{ - Match: table.Name, - Filter: tablet.Shard, - } - } - for i, shard := range tmc.sourceShards { - resp.Streams[i] = &tabletmanagerdatapb.ReadVReplicationWorkflowResponse_Stream{ - Id: int32(i + 1), - Bls: &binlogdatapb.BinlogSource{ - Keyspace: tmc.sourceKeyspace, - Shard: shard, - Filter: &binlogdatapb.Filter{ - Rules: rules, - }, - }, - } - } - - return resp, nil -} - -func (tmc *fakeTMClient) PrimaryPosition(ctx context.Context, tablet *topodatapb.Tablet) (string, error) { - return fmt.Sprintf("%s/%s", gtidFlavor, gtidPosition), nil -} - -func (tmc *fakeTMClient) VReplicationWaitForPos(ctx context.Context, tablet *topodatapb.Tablet, id int32, pos string) error { - return nil -} - -func (tmc *fakeTMClient) ExecuteFetchAsAllPrivs(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest) (*querypb.QueryResult, error) { - return &querypb.QueryResult{ - RowsAffected: 1, - }, nil -} - -func (tmc *fakeTMClient) VDiff(ctx context.Context, tablet *topodatapb.Tablet, req *tabletmanagerdatapb.VDiffRequest) (*tabletmanagerdatapb.VDiffResponse, error) { - return &tabletmanagerdatapb.VDiffResponse{ - Id: 1, - VdiffUuid: req.VdiffUuid, - Output: &querypb.QueryResult{ - RowsAffected: 1, - }, - }, nil -} diff --git a/go/vt/vttablet/tabletmanager/vdiff/action.go b/go/vt/vttablet/tabletmanager/vdiff/action.go index e38a18a921b..ba09d8914ad 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/action.go +++ b/go/vt/vttablet/tabletmanager/vdiff/action.go @@ -25,6 +25,7 @@ import ( "github.com/google/uuid" + "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vterrors" @@ -155,7 +156,12 @@ func (vde *Engine) handleCreateResumeAction(ctx context.Context, dbClient binlog var err error options := req.Options - query := fmt.Sprintf(sqlGetVDiffID, encodeString(req.VdiffUuid)) + 
query, err := sqlparser.ParseAndBind(sqlGetVDiffID, + sqltypes.StringBindVariable(req.VdiffUuid), + ) + if err != nil { + return err + } if qr, err = dbClient.ExecuteFetch(query, 1); err != nil { return err } @@ -308,9 +314,6 @@ func (vde *Engine) handleDeleteAction(ctx context.Context, dbClient binlogplayer switch req.ActionArg { case AllActionArg: -<<<<<<< HEAD - query = fmt.Sprintf(sqlDeleteVDiffs, encodeString(req.Keyspace), encodeString(req.Workflow)) -======= // We need to stop any running controllers before we delete // the vdiff records. query, err := sqlparser.ParseAndBind(sqlGetVDiffIDsByKeyspaceWorkflow, @@ -334,15 +337,11 @@ func (vde *Engine) handleDeleteAction(ctx context.Context, dbClient binlogplayer if err != nil { return err } ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) default: uuid, err := uuid.Parse(req.ActionArg) if err != nil { return fmt.Errorf("action argument %s not supported", req.ActionArg) } -<<<<<<< HEAD - query = fmt.Sprintf(sqlDeleteVDiffByUUID, encodeString(uuid.String())) -======= // We need to be sure that the controller is stopped, if // it's still running, before we delete the vdiff record. query, err := sqlparser.ParseAndBind(sqlGetVDiffID, @@ -367,7 +366,6 @@ func (vde *Engine) handleDeleteAction(ctx context.Context, dbClient binlogplayer if err != nil { return err } ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) } // Execute the query which deletes the vdiff record(s). 
if _, err := dbClient.ExecuteFetch(deleteQuery, 1); err != nil { diff --git a/go/vt/vttablet/tabletmanager/vdiff/controller.go b/go/vt/vttablet/tabletmanager/vdiff/controller.go index 265297d5114..f24230eb0e0 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/controller.go +++ b/go/vt/vttablet/tabletmanager/vdiff/controller.go @@ -23,23 +23,13 @@ import ( "strings" "time" -<<<<<<< HEAD - "vitess.io/vitess/go/vt/proto/tabletmanagerdata" - "vitess.io/vitess/go/vt/vterrors" - "google.golang.org/protobuf/encoding/prototext" "vitess.io/vitess/go/mysql" -======= - "google.golang.org/protobuf/encoding/prototext" - - "vitess.io/vitess/go/mysql/replication" ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/binlog/binlogplayer" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/tabletmanagerdata" - "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tmclient" diff --git a/go/vt/vttablet/tabletmanager/vdiff/schema.go b/go/vt/vttablet/tabletmanager/vdiff/schema.go index fb72a5e5161..619c0e5cc79 100644 --- a/go/vt/vttablet/tabletmanager/vdiff/schema.go +++ b/go/vt/vttablet/tabletmanager/vdiff/schema.go @@ -28,9 +28,9 @@ const ( sqlGetVDiffByID = "select * from _vt.vdiff where id = %d" sqlDeleteVDiffs = `delete from vd, vdt, vdl using _vt.vdiff as vd left join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) left join _vt.vdiff_log as vdl on (vd.id = vdl.vdiff_id) - where vd.keyspace = %s and vd.workflow = %s` + where vd.keyspace = %a and vd.workflow = %a` sqlDeleteVDiffByUUID = `delete from vd, vdt using _vt.vdiff as vd left join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) - where vd.vdiff_uuid = %s` + where vd.vdiff_uuid = %a` sqlVDiffSummary = `select vd.state as vdiff_state, vd.last_error as last_error, vdt.table_name as table_name, vd.vdiff_uuid as 'uuid', vdt.state as table_state, 
vdt.table_rows as table_rows, vd.started_at as started_at, vdt.rows_compared as rows_compared, vd.completed_at as completed_at, @@ -40,16 +40,7 @@ const ( // sqlUpdateVDiffState has a penultimate placeholder for any additional columns you want to update, e.g. `, foo = 1` sqlUpdateVDiffState = "update _vt.vdiff set state = %s, last_error = %s %s where id = %d" sqlUpdateVDiffStopped = `update _vt.vdiff as vd, _vt.vdiff_table as vdt set vd.state = 'stopped', vdt.state = 'stopped', vd.last_error = '' -<<<<<<< HEAD where vd.id = vdt.vdiff_id and vd.id = %d and vd.state != 'completed'` - sqlGetVReplicationEntry = "select * from _vt.vreplication %s" - sqlGetVDiffsToRun = "select * from _vt.vdiff where state in ('started','pending')" // what VDiffs have not been stopped or completed - sqlGetVDiffsToRetry = "select * from _vt.vdiff where state = 'error' and json_unquote(json_extract(options, '$.core_options.auto_retry')) = 'true'" - sqlGetVDiffID = "select id as id from _vt.vdiff where vdiff_uuid = %s" - sqlGetAllVDiffs = "select * from _vt.vdiff order by id desc" - sqlGetAllTableRows = "select table_name as table_name, table_rows as table_rows from INFORMATION_SCHEMA.TABLES where table_schema = %s and table_name in (%s)" -======= - where vd.id = vdt.vdiff_id and vd.id = %a and vd.state != 'completed'` sqlGetVReplicationEntry = "select * from _vt.vreplication %s" sqlGetVDiffsToRun = "select * from _vt.vdiff where state in ('started','pending')" // what VDiffs have not been stopped or completed sqlGetVDiffsToRetry = "select * from _vt.vdiff where state = 'error' and json_unquote(json_extract(options, '$.core_options.auto_retry')) = 'true'" @@ -58,7 +49,6 @@ const ( sqlGetAllVDiffs = "select * from _vt.vdiff order by id desc" sqlGetTableRows = "select table_rows as table_rows from INFORMATION_SCHEMA.TABLES where table_schema = %a and table_name = %a" sqlGetAllTableRows = "select table_name as table_name, table_rows as table_rows from INFORMATION_SCHEMA.TABLES where 
table_schema = %s and table_name in (%s)" ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) sqlNewVDiffTable = "insert into _vt.vdiff_table(vdiff_id, table_name, state, table_rows) values(%d, %s, 'pending', %d)" sqlGetVDiffTable = `select vdt.lastpk as lastpk, vdt.mismatch as mismatch, vdt.report as report diff --git a/go/vt/wrangler/workflow.go b/go/vt/wrangler/workflow.go index f2ab2bb85b9..46c1a629a20 100644 --- a/go/vt/wrangler/workflow.go +++ b/go/vt/wrangler/workflow.go @@ -15,14 +15,9 @@ import ( "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topotools" "vitess.io/vitess/go/vt/vtctl/workflow" -<<<<<<< HEAD "vitess.io/vitess/go/vt/vtgate/evalengine" - -======= vdiff2 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vdiff" - binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) @@ -718,13 +713,7 @@ func (wr *Wrangler) deleteWorkflowVDiffData(ctx context.Context, tablet *topodat Action: string(vdiff2.DeleteAction), ActionArg: vdiff2.AllActionArg, }); err != nil { -<<<<<<< HEAD - if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num != mysql.ERNoSuchTable { // the tables may not exist if no vdiffs have been run - wr.Logger().Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) - } -======= log.Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) ->>>>>>> 04834c4fec (VDiff: Cleanup the controller for a VDiff before deleting it (#14107)) } }