Skip to content

Support lock-wait with --lock-retry in restic 0.16 #240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .mockery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ packages:
github.com/creativeprojects/resticprofile/schedule:
interfaces:
Handler:

github.com/creativeprojects/resticprofile/monitor:
interfaces:
OutputAnalysis:
2 changes: 1 addition & 1 deletion config/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ type Global struct {
ResticVersion string // not configurable at the moment. To be set after ResticBinary is known.
FilterResticFlags bool `mapstructure:"restic-arguments-filter" default:"true" description:"Remove unknown flags instead of passing all configured flags to restic"`
ResticLockRetryAfter time.Duration `mapstructure:"restic-lock-retry-after" default:"1m" description:"Time to wait before trying to get a lock on a restic repositoey - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
ResticStaleLockAge time.Duration `mapstructure:"restic-stale-lock-age" default:"2h" description:"The age an unused lock on a restic repository must have at least before resiticprofile attempts to unlock - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
ResticStaleLockAge time.Duration `mapstructure:"restic-stale-lock-age" default:"1h" description:"The age an unused lock on a restic repository must have at least before resiticprofile attempts to unlock - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
ShellBinary []string `mapstructure:"shell" default:"auto" examples:"sh;bash;pwsh;powershell;cmd" description:"The shell that is used to run commands (default is OS specific)"`
MinMemory uint64 `mapstructure:"min-memory" default:"100" description:"Minimum available memory (in MB) required to run any commands - see https://creativeprojects.github.io/resticprofile/usage/memory/"`
Scheduler string `mapstructure:"scheduler" description:"Leave blank for the default scheduler or use \"crond\" to select cron on supported operating systems"`
Expand Down
2 changes: 1 addition & 1 deletion constants/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const (
DefaultCommand = "snapshots"
DefaultFilterResticFlags = true
DefaultResticLockRetryAfter = 60 * time.Second
DefaultResticStaleLockAge = 2 * time.Hour
DefaultResticStaleLockAge = 1 * time.Hour
DefaultTheme = "light"
DefaultIONiceFlag = false
DefaultIONiceClass = 2
Expand Down
7 changes: 4 additions & 3 deletions constants/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ var (

// Limits for restic lock handling (stale locks and retry on lock failure)
const (
MinResticLockRetryTime = 15 * time.Second
MaxResticLockRetryTime = 30 * time.Minute
MinResticStaleLockAge = 1 * time.Hour
MinResticLockRetryDelay = 15 * time.Second
MaxResticLockRetryDelay = 30 * time.Minute
MaxResticLockRetryTimeArgument = 10 * time.Minute
MinResticStaleLockAge = 15 * time.Minute
)

// Schedule lock mode config options
Expand Down
1 change: 1 addition & 0 deletions constants/parameter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const (
ParameterVerbose = "verbose"
ParameterDescription = "description"
ParameterVersion = "version"
ParameterRetryLock = "retry-lock"
ParameterRepository = "repo"
ParameterRepositoryFile = "repository-file"
ParameterPasswordFile = "password-file"
Expand Down
2 changes: 1 addition & 1 deletion docs/content/usage/locks/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ In some cases, resticprofile as well as restic may leave a lock behind if the pr

For that matter, if you add the flag `force-inactive-lock` to your profile, resticprofile will detect and remove stale locks:
* **resticprofile locks**: Check for the presence of a process with the PID indicated in the lockfile. If it can't find any, it will try to delete the lock and continue the operation (locking again, running profile and so on...)
* **restic locks**: Evaluate if a restic command failed on acquiring a lock. If the lock is older than `restic-stale-lock-age`, invoke `restic unlock` and retry the command that failed (can be disabled by setting `restic-stale-lock-age` to 0, default is 2h).
* **restic locks**: Evaluate if a restic command failed on acquiring a lock. If the lock is older than `restic-stale-lock-age`, invoke `restic unlock` and retry the command that failed (can be disabled by setting `restic-stale-lock-age` to 0, default is 1h).

{{< tabs groupId="config-with-json" >}}
{{% tab name="toml" %}}
Expand Down
4 changes: 4 additions & 0 deletions monitor/summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ type OutputAnalysis interface {
// If no remote lock is held or the time cannot be determined, the second parameter is false.
GetRemoteLockedSince() (time.Duration, bool)

// GetRemoteLockedMaxWait returns the max time duration that restic waited for the lock to acquire.
// If no remote lock is held or the time cannot be determined, the second parameter is false.
GetRemoteLockedMaxWait() (time.Duration, bool)

// GetRemoteLockedBy returns who locked the remote lock, if available.
GetRemoteLockedBy() (string, bool)
}
36 changes: 26 additions & 10 deletions shell/analyser.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,10 @@ type OutputAnalyser struct {
}

var outputAnalyserPatterns = map[string]*regexp.Regexp{
"lock-failure,who": regexp.MustCompile("unable to create lock.+already locked.+?by (.+)$"),
"lock-failure,age": regexp.MustCompile("lock was created at.+\\(([^()]+)\\s+ago\\)"),
"lock-failure,stale": regexp.MustCompile("the\\W+unlock\\W+command can be used to remove stale locks"),
"lock-failure,who": regexp.MustCompile("unable to create lock.+already locked.+?by (.+)$"),
"lock-failure,age": regexp.MustCompile("lock was created at.+\\(([^()]+)\\s+ago\\)"),
"lock-failure,stale": regexp.MustCompile("the\\W+unlock\\W+command can be used to remove stale locks"),
"lock-retry,max-wait": regexp.MustCompile("repo already locked, waiting up to (\\S+) for the lock"),
}

func NewOutputAnalyser() *OutputAnalyser {
Expand Down Expand Up @@ -99,7 +100,7 @@ func (a *OutputAnalyser) SetCallback(name, pattern string, minCount, maxCalls in
return nil
}

func (a OutputAnalyser) invokeCallback(name, line string) (err error) {
func (a *OutputAnalyser) invokeCallback(name, line string) (err error) {
count, hasCount := a.counts[name]
cb, hasCallback := a.callbacks[name]

Expand All @@ -124,7 +125,7 @@ func (a OutputAnalyser) invokeCallback(name, line string) (err error) {
return
}

func (a OutputAnalyser) ContainsRemoteLockFailure() bool {
func (a *OutputAnalyser) ContainsRemoteLockFailure() bool {
a.lock.Lock()
defer a.lock.Unlock()

Expand All @@ -134,7 +135,7 @@ func (a OutputAnalyser) ContainsRemoteLockFailure() bool {
return false
}

func (a OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
func (a *OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
a.lock.Lock()
defer a.lock.Unlock()

Expand All @@ -149,7 +150,22 @@ func (a OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
return 0, false
}

func (a OutputAnalyser) GetRemoteLockedBy() (string, bool) {
func (a *OutputAnalyser) GetRemoteLockedMaxWait() (time.Duration, bool) {
a.lock.Lock()
defer a.lock.Unlock()

if m, ok := a.matches["lock-retry,max-wait"]; ok && len(m) > 1 {
if maxWait, err := time.ParseDuration(m[1]); err == nil {
return maxWait.Truncate(time.Millisecond), true
} else {
clog.Debugf("failed parsing restic retry-lock max wait. Cause %s", err.Error())
}
}

return 0, false
}

func (a *OutputAnalyser) GetRemoteLockedBy() (string, bool) {
a.lock.Lock()
defer a.lock.Unlock()

Expand All @@ -160,11 +176,11 @@ func (a OutputAnalyser) GetRemoteLockedBy() (string, bool) {
return "", false
}

func (a OutputAnalyser) AnalyseStringLines(output string) error {
func (a *OutputAnalyser) AnalyseStringLines(output string) error {
return a.AnalyseLines(strings.NewReader(output))
}

func (a OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
func (a *OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
a.lock.Lock()
defer a.lock.Unlock()

Expand All @@ -183,7 +199,7 @@ func (a OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
return
}

func (a OutputAnalyser) analyseLine(line string) (err error) {
func (a *OutputAnalyser) analyseLine(line string) (err error) {
for _, pattern := range a.patterns {
match := pattern.expression.FindStringSubmatch(line)
if match != nil {
Expand Down
7 changes: 7 additions & 0 deletions shell/analyser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
)

const ResticLockFailureOutput = `
repo already locked, waiting up to 2m1s115.3ms for the lock
unable to create lock in backend: repository is already locked by PID 27153 on app-server-01 by root (UID 0, GID 0)
lock was created at 2021-04-17 19:00:16 (1s727.5ms ago)
storage ID 870530a4
Expand All @@ -33,6 +34,12 @@ func TestRemoteLockFailure(t *testing.T) {
assert.Equal(t, true, ok)
assert.Equal(t, "PID 27153 on app-server-01 by root (UID 0, GID 0)", name)
})

t.Run("GetRemoteLockedMaxWait", func(t *testing.T) {
wait, ok := analysis.GetRemoteLockedMaxWait()
assert.Equal(t, true, ok)
assert.Equal(t, 2*time.Minute+time.Second+(115*time.Millisecond), wait)
})
}

func TestCustomErrorCallback(t *testing.T) {
Expand Down
75 changes: 57 additions & 18 deletions wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,11 @@ func (r *resticWrapper) getCommandArgumentsFilter(command string) argumentsFilte
return nil
}

func (r *resticWrapper) containsArguments(arguments []string, subset ...string) (found bool) {
found = len(r.validArgumentsFilter(subset)(arguments, true)) > 0
return
}

func (r *resticWrapper) prepareCommand(command string, args *shell.Args, allowExtraValues bool) shellCommandDefinition {
// Create local instance to allow modification
args = args.Clone()
Expand All @@ -364,6 +369,20 @@ func (r *resticWrapper) prepareCommand(command string, args *shell.Args, allowEx
args.AddArgs(r.profile.GetBackupSource(), shell.ArgConfigBackupSource)
}

// Add retry-lock (supported from restic 0.16, depends on filter being enabled)
if lockRetryTime, enabled := r.remainingLockRetryTime(); enabled && filter != nil {
// limiting the retry handling in restic, we need to make sure we can retry internally so that unlock is called.
lockRetryTime = lockRetryTime - r.global.ResticLockRetryAfter - constants.MinResticLockRetryDelay
if lockRetryTime > constants.MaxResticLockRetryTimeArgument {
lockRetryTime = constants.MaxResticLockRetryTimeArgument
}
lockRetryTime = lockRetryTime.Truncate(time.Minute)

if lockRetryTime > 0 && !r.containsArguments(args.GetAll(), fmt.Sprintf("--%s", constants.ParameterRetryLock)) {
args.AddFlag(constants.ParameterRetryLock, fmt.Sprintf("%.0fm", lockRetryTime.Minutes()), shell.ArgConfigEscape)
}
}

// Build arguments and publicArguments (for logging)
arguments, publicArguments := args.GetAll(), config.GetNonConfidentialArgs(r.profile, args).GetAll()
if filter != nil {
Expand Down Expand Up @@ -731,7 +750,16 @@ func (r *resticWrapper) canRetryAfterError(command string, summary monitor.Summa
output := summary.OutputAnalysis

if output != nil && output.ContainsRemoteLockFailure() {
clog.Debugf("repository lock failed when running '%s'", command)
// Do not count lock-wait time as normal execution time (to calc correct remaining lock-wait time)
if maxWait, ok := output.GetRemoteLockedMaxWait(); ok {
r.executionTime -= maxWait
} else {
r.executionTime -= summary.Duration
}
if r.executionTime < 0 {
r.executionTime = 0
}
clog.Debugf("repository lock failed when running '%s', counted execution time %s", command, r.executionTime.Truncate(time.Second))
retry, sleep = r.canRetryAfterRemoteLockFailure(output)
}

Expand Down Expand Up @@ -787,28 +815,26 @@ func (r *resticWrapper) canRetryAfterRemoteLockFailure(output monitor.OutputAnal
}

// Check if we have time left to wait on a non-stale lock
retryDelay := r.global.ResticLockRetryAfter

if r.lockWait != nil && retryDelay > 0 {
elapsedTime := time.Since(r.startTime)
availableTime := *r.lockWait - elapsedTime + r.executionTime

if retryDelay < constants.MinResticLockRetryTime {
retryDelay = constants.MinResticLockRetryTime
} else if retryDelay > constants.MaxResticLockRetryTime {
retryDelay = constants.MaxResticLockRetryTime
if remainingTime, enabled := r.remainingLockRetryTime(); enabled && remainingTime > 0 {
retryDelay := r.global.ResticLockRetryAfter
if retryDelay < constants.MinResticLockRetryDelay {
retryDelay = constants.MinResticLockRetryDelay
} else if retryDelay > constants.MaxResticLockRetryDelay {
retryDelay = constants.MaxResticLockRetryDelay
}

if retryDelay > availableTime {
retryDelay = availableTime
if retryDelay > remainingTime {
retryDelay = remainingTime
}

if retryDelay >= constants.MinResticLockRetryTime {
if retryDelay >= constants.MinResticLockRetryDelay {
lockName := r.profile.Repository.String()
if lockedBy, ok := output.GetRemoteLockedBy(); ok {
lockName = fmt.Sprintf("%s locked by %s", lockName, lockedBy)
}
logLockWait(lockName, r.startTime, time.Unix(0, 0), *r.lockWait)
if r.lockWait != nil {
logLockWait(lockName, r.startTime, time.Unix(0, 0), r.executionTime, *r.lockWait)
}

return true, retryDelay
}
Expand All @@ -818,6 +844,18 @@ func (r *resticWrapper) canRetryAfterRemoteLockFailure(output monitor.OutputAnal
return false, 0
}

func (r *resticWrapper) remainingLockRetryTime() (remaining time.Duration, enabled bool) {
enabled = r.global.ResticLockRetryAfter > 0 && r.lockWait != nil
if enabled {
elapsedTime := time.Since(r.startTime)
remaining = *r.lockWait - elapsedTime + r.executionTime
}
if remaining < 0 {
remaining = 0
}
return
}

// lockRun is making sure the function is only run once by putting a lockfile on the disk
func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setPID lock.SetPID) error) error {
// No lock
Expand Down Expand Up @@ -872,7 +910,7 @@ func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setP
}
if time.Since(start) < *lockWait {
lockName := fmt.Sprintf("%s locked by %s", lockFile, locker)
lockWaitLogged = logLockWait(lockName, start, lockWaitLogged, *lockWait)
lockWaitLogged = logLockWait(lockName, start, lockWaitLogged, 0, *lockWait)

sleep := 3 * time.Second
if sleep > *lockWait {
Expand All @@ -893,15 +931,16 @@ func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setP

const logLockWaitEvery = 5 * time.Minute

func logLockWait(lockName string, started, lastLogged time.Time, maxLockWait time.Duration) time.Time {
func logLockWait(lockName string, started, lastLogged time.Time, executed, maxLockWait time.Duration) time.Time {
now := time.Now()
lastLog := now.Sub(lastLogged)
elapsed := now.Sub(started).Truncate(time.Second)
waited := (elapsed - executed).Truncate(time.Second)
remaining := (maxLockWait - elapsed).Truncate(time.Second)

if lastLog > logLockWaitEvery {
if elapsed > logLockWaitEvery {
clog.Infof("lock wait (remaining %s / elapsed %s): %s", remaining, elapsed, strings.TrimSpace(lockName))
clog.Infof("lock wait (remaining %s / waited %s / elapsed %s): %s", remaining, waited, elapsed, strings.TrimSpace(lockName))
} else {
clog.Infof("lock wait (remaining %s): %s", remaining, strings.TrimSpace(lockName))
}
Expand Down
Loading