Skip to content

Commit 4a36e24

Browse files
authored
Merge pull request #14 from ccoffey/cathal/safer_cut_over
Cut-over should wait for heartbeat lag to be low enough to succeed
2 parents 253658d + 3135a25 commit 4a36e24

File tree

4 files changed

+56
-5
lines changed

4 files changed

+56
-5
lines changed

doc/hooks.md

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ The following variables are available on all hooks:
6666
- `GH_OST_ESTIMATED_ROWS` - estimated total rows in table
6767
- `GH_OST_COPIED_ROWS` - number of rows copied by `gh-ost`
6868
- `GH_OST_INSPECTED_LAG` - lag in seconds (floating point) of inspected server
69+
- `GH_OST_HEARTBEAT_LAG` - lag in seconds (floating point) of heartbeat
6970
- `GH_OST_PROGRESS` - progress pct ([0..100], floating point) of migration
7071
- `GH_OST_MIGRATED_HOST`
7172
- `GH_OST_INSPECTED_HOST`

go/base/context.go

+21
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ type MigrationContext struct {
178178
RenameTablesEndTime time.Time
179179
pointOfInterestTime time.Time
180180
pointOfInterestTimeMutex *sync.Mutex
181+
lastHeartbeatOnChangelogTime time.Time
182+
lastHeartbeatOnChangelogMutex *sync.Mutex
181183
CurrentLag int64
182184
currentProgress uint64
183185
ThrottleHTTPStatusCode int64
@@ -272,6 +274,7 @@ func NewMigrationContext() *MigrationContext {
272274
throttleControlReplicaKeys: mysql.NewInstanceKeyMap(),
273275
configMutex: &sync.Mutex{},
274276
pointOfInterestTimeMutex: &sync.Mutex{},
277+
lastHeartbeatOnChangelogMutex: &sync.Mutex{},
275278
ColumnRenameMap: make(map[string]string),
276279
PanicAbort: make(chan error),
277280
Log: NewDefaultLogger(),
@@ -455,6 +458,10 @@ func (this *MigrationContext) MarkRowCopyEndTime() {
455458
this.RowCopyEndTime = time.Now()
456459
}
457460

461+
func (this *MigrationContext) TimeSinceLastHeartbeatOnChangelog() time.Duration {
462+
return time.Since(this.GetLastHeartbeatOnChangelogTime())
463+
}
464+
458465
func (this *MigrationContext) GetCurrentLagDuration() time.Duration {
459466
return time.Duration(atomic.LoadInt64(&this.CurrentLag))
460467
}
@@ -494,6 +501,20 @@ func (this *MigrationContext) TimeSincePointOfInterest() time.Duration {
494501
return time.Since(this.pointOfInterestTime)
495502
}
496503

504+
func (this *MigrationContext) SetLastHeartbeatOnChangelogTime(t time.Time) {
505+
this.lastHeartbeatOnChangelogMutex.Lock()
506+
defer this.lastHeartbeatOnChangelogMutex.Unlock()
507+
508+
this.lastHeartbeatOnChangelogTime = t
509+
}
510+
511+
func (this *MigrationContext) GetLastHeartbeatOnChangelogTime() time.Time {
512+
this.lastHeartbeatOnChangelogMutex.Lock()
513+
defer this.lastHeartbeatOnChangelogMutex.Unlock()
514+
515+
return this.lastHeartbeatOnChangelogTime
516+
}
517+
497518
func (this *MigrationContext) SetHeartbeatIntervalMilliseconds(heartbeatIntervalMilliseconds int64) {
498519
if heartbeatIntervalMilliseconds < 100 {
499520
heartbeatIntervalMilliseconds = 100

go/logic/hooks.go

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ func (this *HooksExecutor) applyEnvironmentVariables(extraVariables ...string) [
6464
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname()))
6565
env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname))
6666
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", this.migrationContext.GetCurrentLagDuration().Seconds()))
67+
env = append(env, fmt.Sprintf("GH_OST_HEARTBEAT_LAG=%f", this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds()))
6768
env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", this.migrationContext.GetProgressPct()))
6869
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage))
6970
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner))

go/logic/migrator.go

+33-5
Original file line numberDiff line numberDiff line change
@@ -207,12 +207,20 @@ func (this *Migrator) canStopStreaming() bool {
207207
return atomic.LoadInt64(&this.migrationContext.CutOverCompleteFlag) != 0
208208
}
209209

210-
// onChangelogStateEvent is called when a binlog event operation on the changelog table is intercepted.
211-
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
210+
// onChangelogEvent is called when a binlog event operation on the changelog table is intercepted.
211+
func (this *Migrator) onChangelogEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
212212
// Hey, I created the changelog table, I know the type of columns it has!
213-
if hint := dmlEvent.NewColumnValues.StringColumn(2); hint != "state" {
213+
switch hint := dmlEvent.NewColumnValues.StringColumn(2); hint {
214+
case "state":
215+
return this.onChangelogStateEvent(dmlEvent)
216+
case "heartbeat":
217+
return this.onChangelogHeartbeatEvent(dmlEvent)
218+
default:
214219
return nil
215220
}
221+
}
222+
223+
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
216224
changelogStateString := dmlEvent.NewColumnValues.StringColumn(3)
217225
changelogState := ReadChangelogState(changelogStateString)
218226
this.migrationContext.Log.Infof("Intercepted changelog state %s", changelogState)
@@ -245,6 +253,18 @@ func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (er
245253
return nil
246254
}
247255

256+
func (this *Migrator) onChangelogHeartbeatEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
257+
changelogHeartbeatString := dmlEvent.NewColumnValues.StringColumn(3)
258+
259+
heartbeatTime, err := time.Parse(time.RFC3339Nano, changelogHeartbeatString)
260+
if err != nil {
261+
return this.migrationContext.Log.Errore(err)
262+
} else {
263+
this.migrationContext.SetLastHeartbeatOnChangelogTime(heartbeatTime)
264+
return nil
265+
}
266+
}
267+
248268
// listenOnPanicAbort aborts on abort request
249269
func (this *Migrator) listenOnPanicAbort() {
250270
err := <-this.migrationContext.PanicAbort
@@ -476,6 +496,13 @@ func (this *Migrator) cutOver() (err error) {
476496
this.migrationContext.Log.Debugf("checking for cut-over postpone")
477497
this.sleepWhileTrue(
478498
func() (bool, error) {
499+
heartbeatLag := this.migrationContext.TimeSinceLastHeartbeatOnChangelog()
500+
maxLagMillisecondsThrottle := time.Duration(atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) * time.Millisecond
501+
cutOverLockTimeout := time.Duration(this.migrationContext.CutOverLockTimeoutSeconds) * time.Second
502+
if heartbeatLag > maxLagMillisecondsThrottle || heartbeatLag > cutOverLockTimeout {
503+
this.migrationContext.Log.Debugf("current HeartbeatLag (%.2fs) is too high, it needs to be less than both --max-lag-millis (%.2fs) and --cut-over-lock-timeout-seconds (%.2fs) to continue", heartbeatLag.Seconds(), maxLagMillisecondsThrottle.Seconds(), cutOverLockTimeout.Seconds())
504+
return true, nil
505+
}
479506
if this.migrationContext.PostponeCutOverFlagFile == "" {
480507
return false, nil
481508
}
@@ -962,13 +989,14 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) {
962989

963990
currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates()
964991

965-
status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, State: %s; ETA: %s",
992+
status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, HeartbeatLag: %.2fs, State: %s; ETA: %s",
966993
totalRowsCopied, rowsEstimate, progressPct,
967994
atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied),
968995
len(this.applyEventsQueue), cap(this.applyEventsQueue),
969996
base.PrettifyDurationOutput(elapsedTime), base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()),
970997
currentBinlogCoordinates,
971998
this.migrationContext.GetCurrentLagDuration().Seconds(),
999+
this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds(),
9721000
state,
9731001
eta,
9741002
)
@@ -995,7 +1023,7 @@ func (this *Migrator) initiateStreaming() error {
9951023
this.migrationContext.DatabaseName,
9961024
this.migrationContext.GetChangelogTableName(),
9971025
func(dmlEvent *binlog.BinlogDMLEvent) error {
998-
return this.onChangelogStateEvent(dmlEvent)
1026+
return this.onChangelogEvent(dmlEvent)
9991027
},
10001028
)
10011029

0 commit comments

Comments
 (0)