Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion go/vt/mysqlctl/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,54 @@ func (mysqld *Mysqld) ExecuteSuperQueryList(ctx context.Context, queryList []str
func (mysqld *Mysqld) executeSuperQueryListConn(ctx context.Context, conn *dbconnpool.PooledDBConnection, queryList []string) error {
for _, query := range queryList {
log.Infof("exec %v", redactMasterPassword(query))
if _, err := mysqld.executeFetchContext(ctx, conn, query, 10000, false); err != nil {
_, err := mysqld.executeFetchContext(ctx, conn, query, 10000, false)
switch {
case err == nil:
return nil

case strings.ToUpper(query) == "START SLAVE" && isReplicationErr1872(err):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think a generic "execute query" function is the right place for this specific logic. If there's not a subroutine like mysqld.StartSlave() that all the code paths you're trying to catch use, then we should add one and change the appropriate sites to use it.

Or alternatively, would it work for you if we only check for this error in the repair section of the replication health reporter? That seems safer than injecting the fix at this low level, unless for some reason the periodic repair doesn't work for your needs.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it felt weird to do this at this low level. I started at repairReplication and kept digging. That code path never called StartSlave that I found.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're ok with only doing this check in the repairReplication code path, then we can just make sure sufficient error detail gets propagated back up to that level to detect this specific case, and put the logic there. WDYT?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That works for me. This was just a quick stab in the dark to get some feedback.

if err = mysqld.fixReplication(ctx, conn); err == nil {
// keep running commands if this fixed replication
continue
}
fallthrough

default:
return fmt.Errorf("ExecuteFetch(%v) failed: %v", redactMasterPassword(query), err.Error())
}
}
return nil
}

func isReplicationErr1872(err error) bool {
if err == nil {
return false
}
return strings.Contains(err.Error(), "Slave failed to initialize relay log info structure from the repository")
}

// fixReplication attempts a fix for this error:
// Slave failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START SLAVE
// see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067
func (mysqld *Mysqld) fixReplication(ctx context.Context, conn *dbconnpool.PooledDBConnection) error {
queryList := []string{
"STOP SLAVE",
"RESET SLAVE",
"START SLAVE",
}

// copy from executeSuperQueryListConn to avoid infinite loop
for _, query := range queryList {
log.Infof("exec %v", redactMasterPassword(query))
if _, err := mysqld.executeFetchContext(ctx, conn, query, 10000, false); err != nil {
return err
}
}

log.Infof("replication fixed for errno 1872")
return nil
}

// FetchSuperQuery returns the results of executing a query as a super user.
func (mysqld *Mysqld) FetchSuperQuery(ctx context.Context, query string) (*sqltypes.Result, error) {
conn, connErr := getPoolReconnect(ctx, mysqld.dbaPool)
Expand Down