diff --git a/op-challenger/cmd/run_trace.go b/op-challenger/cmd/run_trace.go
index e2fee2e800f3c..588b0894751d7 100644
--- a/op-challenger/cmd/run_trace.go
+++ b/op-challenger/cmd/run_trace.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"strings"
+	"time"
 
 	"github.com/ethereum-optimism/optimism/op-challenger/flags"
 	gameTypes "github.com/ethereum-optimism/optimism/op-challenger/game/types"
@@ -43,11 +44,12 @@ func RunTrace(ctx *cli.Context, _ context.CancelCauseFunc) (cliapp.Lifecycle, er
 			runConfigs = append(runConfigs, runner.RunConfig{GameType: gameType})
 		}
 	}
-	return runner.NewRunner(logger, cfg, runConfigs), nil
+	vmTimeout := ctx.Duration(VMTimeoutFlag.Name)
+	return runner.NewRunner(logger, cfg, runConfigs, vmTimeout), nil
 }
 
 func runTraceFlags() []cli.Flag {
-	return append(flags.Flags, RunTraceRunFlag)
+	return append(flags.Flags, RunTraceRunFlag, VMTimeoutFlag)
 }
 
 var RunTraceCommand = &cli.Command{
@@ -58,6 +60,8 @@ var RunTraceCommand = &cli.Command{
 	Flags: runTraceFlags(),
 }
 
+const DefaultVMTimeout = 3 * time.Hour
+
 var (
 	RunTraceRunFlag = &cli.StringSliceFlag{
 		Name: "run",
@@ -69,6 +73,12 @@ var (
 			"If the prestateHash is omitted, the absolute prestate hash used for new games on-chain.",
 		EnvVars: opservice.PrefixEnvVar(flags.EnvVarPrefix, "RUN"),
 	}
+	VMTimeoutFlag = &cli.DurationFlag{
+		Name:    "vm-timeout",
+		Usage:   fmt.Sprintf("Maximum duration for VM execution per run. Default is %s. Set to 0 to disable timeout.", DefaultVMTimeout),
+		EnvVars: opservice.PrefixEnvVar(flags.EnvVarPrefix, "VM_TIMEOUT"),
+		Value:   DefaultVMTimeout,
+	}
 )
 
 func parseRunArgs(args []string) ([]runner.RunConfig, error) {
diff --git a/op-challenger/runner/metrics.go b/op-challenger/runner/metrics.go
index 8aa7494fa2c04..9b0fd98a56846 100644
--- a/op-challenger/runner/metrics.go
+++ b/op-challenger/runner/metrics.go
@@ -20,16 +20,22 @@ type Metrics struct {
 	*metrics.VmMetrics
 	opmetrics.RPCMetrics
 
-	up                         prometheus.Gauge
-	vmLastExecutionTime        *prometheus.GaugeVec
-	vmLastMemoryUsed           *prometheus.GaugeVec
-	successTotal               *prometheus.CounterVec
-	failuresTotal              *prometheus.CounterVec
-	consecutiveFailuresCurrent *prometheus.GaugeVec
-	panicsTotal                *prometheus.CounterVec
-	invalidTotal               *prometheus.CounterVec
+	up                              prometheus.Gauge
+	vmLastExecutionTime             *prometheus.GaugeVec
+	vmLastMemoryUsed                *prometheus.GaugeVec
+	successTotal                    *prometheus.CounterVec
+	setupFailuresTotal              *prometheus.CounterVec
+	consecutiveSetupFailuresCurrent *prometheus.GaugeVec
+	vmFailuresTotal                 *prometheus.CounterVec
 }
 
+// Reason labels for vmFailuresTotal metric
+const (
+	ReasonIncorrectStatus = "incorrect_status"
+	ReasonPanic           = "panic"
+	ReasonTimeout         = "timeout"
+)
+
 var _ Metricer = (*Metrics)(nil)
 
 // Metrics implementation must implement RegistryMetricer to allow the metrics server to work.
@@ -68,34 +74,30 @@ func NewMetrics(runConfigs []RunConfig) *Metrics {
 			Name:      "success_total",
 			Help:      "Number of VM executions that successfully verified the output root",
 		}, []string{"type"}),
-		failuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
+		setupFailuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
 			Namespace: Namespace,
-			Name:      "failures_total",
-			Help:      "Number of failures to execute a VM",
+			Name:      "setup_failures_total",
+			Help:      "Number of setup failures before VM execution",
 		}, []string{"type"}),
-		consecutiveFailuresCurrent: factory.NewGaugeVec(prometheus.GaugeOpts{
+		consecutiveSetupFailuresCurrent: factory.NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: Namespace,
-			Name:      "consecutive_failures_current",
+			Name:      "consecutive_setup_failures_current",
 			Help:      "Number of consecutive setup failures by VM type. Resets to 0 on any complete run.",
 		}, []string{"type"}),
-		panicsTotal: factory.NewCounterVec(prometheus.CounterOpts{
-			Namespace: Namespace,
-			Name:      "panics_total",
-			Help:      "Number of times the VM panicked",
-		}, []string{"type"}),
-		invalidTotal: factory.NewCounterVec(prometheus.CounterOpts{
+		vmFailuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
 			Namespace: Namespace,
-			Name:      "invalid_total",
-			Help:      "Number of runs that determined the output root was invalid",
-		}, []string{"type"}),
+			Name:      "vm_failures_total",
+			Help:      "Number of VM execution failures by type and reason (incorrect_status, panic, timeout)",
+		}, []string{"type", "reason"}),
 	}
 
 	for _, runConfig := range runConfigs {
 		metrics.successTotal.WithLabelValues(runConfig.Name).Add(0)
-		metrics.failuresTotal.WithLabelValues(runConfig.Name).Add(0)
-		metrics.consecutiveFailuresCurrent.WithLabelValues(runConfig.Name).Set(0)
-		metrics.panicsTotal.WithLabelValues(runConfig.Name).Add(0)
-		metrics.invalidTotal.WithLabelValues(runConfig.Name).Add(0)
+		metrics.setupFailuresTotal.WithLabelValues(runConfig.Name).Add(0)
+		metrics.consecutiveSetupFailuresCurrent.WithLabelValues(runConfig.Name).Set(0)
+		metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonIncorrectStatus).Add(0)
+		metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonPanic).Add(0)
+		metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonTimeout).Add(0)
 		metrics.RecordUp()
 	}
 
@@ -123,22 +125,15 @@ func (m *Metrics) RecordVmMemoryUsed(vmType string, memoryUsed uint64) {
 
 func (m *Metrics) RecordSuccess(vmType string) {
 	m.successTotal.WithLabelValues(vmType).Inc()
-	m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
-}
-
-func (m *Metrics) RecordFailure(vmType string) {
-	m.failuresTotal.WithLabelValues(vmType).Inc()
-	m.consecutiveFailuresCurrent.WithLabelValues(vmType).Inc()
+	m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Set(0)
 }
 
-func (m *Metrics) RecordPanic(vmType string) {
-	m.panicsTotal.WithLabelValues(vmType).Inc()
-	// The result was bad, but we still completed setup successfully
-	m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
+func (m *Metrics) RecordSetupFailure(vmType string) {
+	m.setupFailuresTotal.WithLabelValues(vmType).Inc()
+	m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Inc()
 }
 
-func (m *Metrics) RecordInvalid(vmType string) {
-	m.invalidTotal.WithLabelValues(vmType).Inc()
-	// The result was bad, but we still completed setup successfully
-	m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
+func (m *Metrics) RecordVmFailure(vmType string, reason string) {
+	m.vmFailuresTotal.WithLabelValues(vmType, reason).Inc()
+	m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Set(0)
 }
diff --git a/op-challenger/runner/runner.go b/op-challenger/runner/runner.go
index 796a9da1897a3..da3b06a26c6e5 100644
--- a/op-challenger/runner/runner.go
+++ b/op-challenger/runner/runner.go
@@ -36,6 +36,7 @@ import (
 
 var (
 	ErrUnexpectedStatusCode = errors.New("unexpected status code")
+	ErrVMTimeout            = errors.New("VM execution timed out")
 )
 
 type Metricer interface {
@@ -43,9 +44,8 @@ type Metricer interface {
 	metrics.VmMetricer
 	opmetrics.RPCMetricer
 
-	RecordFailure(vmType string)
-	RecordPanic(vmType string)
-	RecordInvalid(vmType string)
+	RecordSetupFailure(vmType string)
+	RecordVmFailure(vmType string, reason string)
 	RecordSuccess(vmType string)
 }
 
@@ -56,11 +56,24 @@ type RunConfig struct {
 	PrestateFilename string
 }
 
+type TraceProviderCreator func(
+	ctx context.Context,
+	logger log.Logger,
+	m vm.Metricer,
+	cfg *config.Config,
+	prestateSource prestateFetcher,
+	gameType gameTypes.GameType,
+	localInputs utils.LocalGameInputs,
+	dir string,
+) (types.TraceProvider, error)
+
 type Runner struct {
-	log        log.Logger
-	cfg        *config.Config
-	runConfigs []RunConfig
-	m          Metricer
+	log                  log.Logger
+	cfg                  *config.Config
+	runConfigs           []RunConfig
+	m                    Metricer
+	vmTimeout            time.Duration
+	traceProviderCreator TraceProviderCreator
 
 	running atomic.Bool
 	ctx     context.Context
@@ -69,12 +82,14 @@ type Runner struct {
 	metricsSrv *httputil.HTTPServer
 }
 
-func NewRunner(logger log.Logger, cfg *config.Config, runConfigs []RunConfig) *Runner {
+func NewRunner(logger log.Logger, cfg *config.Config, runConfigs []RunConfig, vmTimeout time.Duration) *Runner {
 	return &Runner{
-		log:        logger,
-		cfg:        cfg,
-		runConfigs: runConfigs,
-		m:          NewMetrics(runConfigs),
+		log:                  logger,
+		cfg:                  cfg,
+		runConfigs:           runConfigs,
+		m:                    NewMetrics(runConfigs),
+		vmTimeout:            vmTimeout,
+		traceProviderCreator: createTraceProvider,
 	}
 }
 
@@ -142,13 +157,16 @@ func (r *Runner) runAndRecordOnce(ctx context.Context, rlog log.Logger, runConfi
 	recordError := func(err error, configName string, m Metricer, log log.Logger) {
 		if errors.Is(err, ErrUnexpectedStatusCode) {
 			log.Error("Incorrect status code", "type", runConfig.Name, "err", err)
-			m.RecordInvalid(configName)
+			m.RecordVmFailure(configName, ReasonIncorrectStatus)
 		} else if errors.Is(err, trace.ErrVMPanic) {
 			log.Error("VM panicked", "type", runConfig.Name)
-			m.RecordPanic(configName)
+			m.RecordVmFailure(configName, ReasonPanic)
+		} else if errors.Is(err, ErrVMTimeout) {
+			log.Error("VM execution timed out", "type", runConfig.Name, "timeout", r.vmTimeout)
+			m.RecordVmFailure(configName, ReasonTimeout)
 		} else if err != nil {
 			log.Error("Failed to run", "type", runConfig.Name, "err", err)
-			m.RecordFailure(configName)
+			m.RecordSetupFailure(configName)
 		} else {
 			log.Info("Successfully verified output root", "type", runConfig.Name)
 			m.RecordSuccess(configName)
@@ -195,12 +213,20 @@
 }
 
 func (r *Runner) runOnce(ctx context.Context, logger log.Logger, name string, gameType gameTypes.GameType, prestateSource prestateFetcher, localInputs utils.LocalGameInputs, dir string) error {
-	provider, err := createTraceProvider(ctx, logger, metrics.NewTypedVmMetrics(r.m, name), r.cfg, prestateSource, gameType, localInputs, dir)
+	if r.vmTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, r.vmTimeout)
+		defer cancel()
+	}
+	provider, err := r.traceProviderCreator(ctx, logger, metrics.NewTypedVmMetrics(r.m, name), r.cfg, prestateSource, gameType, localInputs, dir)
 	if err != nil {
 		return fmt.Errorf("failed to create trace provider: %w", err)
 	}
 	hash, err := provider.Get(ctx, types.RootPosition)
 	if err != nil {
+		if errors.Is(err, context.DeadlineExceeded) {
+			return fmt.Errorf("%w: %w", ErrVMTimeout, err)
+		}
 		return fmt.Errorf("failed to execute trace provider: %w", err)
 	}
 	if hash[0] != mipsevm.VMStatusValid {
diff --git a/op-challenger/runner/runner_test.go b/op-challenger/runner/runner_test.go
new file mode 100644
index 0000000000000..2af0cea0b9065
--- /dev/null
+++ b/op-challenger/runner/runner_test.go
@@ -0,0 +1,138 @@
+package runner
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/ethereum-optimism/optimism/cannon/mipsevm"
+	"github.com/ethereum-optimism/optimism/op-challenger/config"
+	"github.com/ethereum-optimism/optimism/op-challenger/game/fault/trace/utils"
+	"github.com/ethereum-optimism/optimism/op-challenger/game/fault/trace/vm"
+	"github.com/ethereum-optimism/optimism/op-challenger/game/fault/types"
+	gameTypes "github.com/ethereum-optimism/optimism/op-challenger/game/types"
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/log"
+	"github.com/stretchr/testify/require"
+)
+
+func TestNewRunnerSetsTimeout(t *testing.T) {
+	timeout := 5 * time.Minute
+	r := NewRunner(nil, nil, nil, timeout)
+	require.Equal(t, timeout, r.vmTimeout)
+}
+
+func TestRunOnceAppliesTimeout(t *testing.T) {
+	tests := []struct {
+		name           string
+		vmTimeout      time.Duration
+		expectDeadline bool
+	}{
+		{
+			name:           "timeout applied when set",
+			vmTimeout:      100 * time.Millisecond,
+			expectDeadline: true,
+		},
+		{
+			name:           "no deadline when timeout is zero",
+			vmTimeout:      0,
+			expectDeadline: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var capturedCtx context.Context
+
+			mockCreator := func(
+				ctx context.Context,
+				_ log.Logger,
+				_ vm.Metricer,
+				_ *config.Config,
+				_ prestateFetcher,
+				_ gameTypes.GameType,
+				_ utils.LocalGameInputs,
+				_ string,
+			) (types.TraceProvider, error) {
+				capturedCtx = ctx
+				return &stubTraceProvider{}, nil
+			}
+
+			r := &Runner{
+				vmTimeout:            tt.vmTimeout,
+				traceProviderCreator: mockCreator,
+			}
+
+			err := r.runOnce(context.Background(), log.New(), "test", gameTypes.CannonGameType, nil, utils.LocalGameInputs{}, "")
+			require.NoError(t, err)
+
+			_, hasDeadline := capturedCtx.Deadline()
+			require.Equal(t, tt.expectDeadline, hasDeadline)
+		})
+	}
+}
+
+type stubTraceProvider struct{}
+
+func (s *stubTraceProvider) Get(_ context.Context, _ types.Position) (common.Hash, error) {
+	// Return a hash with VMStatusValid as the first byte
+	var hash common.Hash
+	hash[0] = mipsevm.VMStatusValid
+	return hash, nil
+}
+
+func (s *stubTraceProvider) GetStepData(_ context.Context, _ types.Position) ([]byte, []byte, *types.PreimageOracleData, error) {
+	return nil, nil, nil, nil
+}
+
+func (s *stubTraceProvider) AbsolutePreStateCommitment(_ context.Context) (common.Hash, error) {
+	return common.Hash{}, nil
+}
+
+func (s *stubTraceProvider) GetL2BlockNumberChallenge(_ context.Context) (*types.InvalidL2BlockNumberChallenge, error) {
+	return nil, types.ErrL2BlockNumberValid
+}
+
+// slowTraceProvider blocks until context is done, simulating a slow VM
+type slowTraceProvider struct{}
+
+func (s *slowTraceProvider) Get(ctx context.Context, _ types.Position) (common.Hash, error) {
+	<-ctx.Done()
+	return common.Hash{}, ctx.Err()
+}
+
+func (s *slowTraceProvider) GetStepData(_ context.Context, _ types.Position) ([]byte, []byte, *types.PreimageOracleData, error) {
+	return nil, nil, nil, nil
+}
+
+func (s *slowTraceProvider) AbsolutePreStateCommitment(_ context.Context) (common.Hash, error) {
+	return common.Hash{}, nil
+}
+
+func (s *slowTraceProvider) GetL2BlockNumberChallenge(_ context.Context) (*types.InvalidL2BlockNumberChallenge, error) {
+	return nil, types.ErrL2BlockNumberValid
+}
+
+func TestRunOnceReturnsTimeoutError(t *testing.T) {
+	mockCreator := func(
+		_ context.Context,
+		_ log.Logger,
+		_ vm.Metricer,
+		_ *config.Config,
+		_ prestateFetcher,
+		_ gameTypes.GameType,
+		_ utils.LocalGameInputs,
+		_ string,
+	) (types.TraceProvider, error) {
+		return &slowTraceProvider{}, nil
+	}
+
+	r := &Runner{
+		vmTimeout:            50 * time.Millisecond,
+		traceProviderCreator: mockCreator,
+	}
+
+	err := r.runOnce(context.Background(), log.New(), "test", gameTypes.CannonGameType, nil, utils.LocalGameInputs{}, "")
+	require.ErrorIs(t, err, ErrVMTimeout)
+	require.ErrorIs(t, err, context.DeadlineExceeded)
+}