Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions op-challenger/cmd/run_trace.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"strings"
"time"

"github.com/ethereum-optimism/optimism/op-challenger/flags"
gameTypes "github.com/ethereum-optimism/optimism/op-challenger/game/types"
Expand Down Expand Up @@ -43,11 +44,12 @@ func RunTrace(ctx *cli.Context, _ context.CancelCauseFunc) (cliapp.Lifecycle, er
runConfigs = append(runConfigs, runner.RunConfig{GameType: gameType})
}
}
return runner.NewRunner(logger, cfg, runConfigs), nil
vmTimeout := ctx.Duration(VMTimeoutFlag.Name)
return runner.NewRunner(logger, cfg, runConfigs, vmTimeout), nil
}

func runTraceFlags() []cli.Flag {
return append(flags.Flags, RunTraceRunFlag)
return append(flags.Flags, RunTraceRunFlag, VMTimeoutFlag)
}

var RunTraceCommand = &cli.Command{
Expand All @@ -58,6 +60,8 @@ var RunTraceCommand = &cli.Command{
Flags: runTraceFlags(),
}

const DefaultVMTimeout = 3 * time.Hour

var (
RunTraceRunFlag = &cli.StringSliceFlag{
Name: "run",
Expand All @@ -69,6 +73,12 @@ var (
"If the prestateHash is omitted, the absolute prestate hash used for new games on-chain.",
EnvVars: opservice.PrefixEnvVar(flags.EnvVarPrefix, "RUN"),
}
VMTimeoutFlag = &cli.DurationFlag{
Name: "vm-timeout",
Usage: fmt.Sprintf("Maximum duration for VM execution per run. Default is %s. Set to 0 to disable timeout.", DefaultVMTimeout),
EnvVars: opservice.PrefixEnvVar(flags.EnvVarPrefix, "VM_TIMEOUT"),
Value: DefaultVMTimeout,
}
)

func parseRunArgs(args []string) ([]runner.RunConfig, error) {
Expand Down
75 changes: 35 additions & 40 deletions op-challenger/runner/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,22 @@ type Metrics struct {
*metrics.VmMetrics
opmetrics.RPCMetrics

up prometheus.Gauge
vmLastExecutionTime *prometheus.GaugeVec
vmLastMemoryUsed *prometheus.GaugeVec
successTotal *prometheus.CounterVec
failuresTotal *prometheus.CounterVec
consecutiveFailuresCurrent *prometheus.GaugeVec
panicsTotal *prometheus.CounterVec
invalidTotal *prometheus.CounterVec
up prometheus.Gauge
vmLastExecutionTime *prometheus.GaugeVec
vmLastMemoryUsed *prometheus.GaugeVec
successTotal *prometheus.CounterVec
setupFailuresTotal *prometheus.CounterVec
consecutiveSetupFailuresCurrent *prometheus.GaugeVec
vmFailuresTotal *prometheus.CounterVec
}

// Reason labels for vmFailuresTotal metric
const (
ReasonIncorrectStatus = "incorrect_status"
ReasonPanic = "panic"
ReasonTimeout = "timeout"
)

var _ Metricer = (*Metrics)(nil)

// Metrics implementation must implement RegistryMetricer to allow the metrics server to work.
Expand Down Expand Up @@ -68,34 +74,30 @@ func NewMetrics(runConfigs []RunConfig) *Metrics {
Name: "success_total",
Help: "Number of VM executions that successfully verified the output root",
}, []string{"type"}),
failuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
setupFailuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "failures_total",
Help: "Number of failures to execute a VM",
Name: "setup_failures_total",
Help: "Number of setup failures before VM execution",
}, []string{"type"}),
consecutiveFailuresCurrent: factory.NewGaugeVec(prometheus.GaugeOpts{
consecutiveSetupFailuresCurrent: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: Namespace,
Name: "consecutive_failures_current",
Name: "consecutive_setup_failures_current",
Help: "Number of consecutive setup failures by VM type. Resets to 0 on any complete run.",
}, []string{"type"}),
panicsTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "panics_total",
Help: "Number of times the VM panicked",
}, []string{"type"}),
invalidTotal: factory.NewCounterVec(prometheus.CounterOpts{
vmFailuresTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "invalid_total",
Help: "Number of runs that determined the output root was invalid",
}, []string{"type"}),
Name: "vm_failures_total",
Help: "Number of VM execution failures by type and reason (incorrect_status, panic, timeout)",
}, []string{"type", "reason"}),
}

for _, runConfig := range runConfigs {
metrics.successTotal.WithLabelValues(runConfig.Name).Add(0)
metrics.failuresTotal.WithLabelValues(runConfig.Name).Add(0)
metrics.consecutiveFailuresCurrent.WithLabelValues(runConfig.Name).Set(0)
metrics.panicsTotal.WithLabelValues(runConfig.Name).Add(0)
metrics.invalidTotal.WithLabelValues(runConfig.Name).Add(0)
metrics.setupFailuresTotal.WithLabelValues(runConfig.Name).Add(0)
metrics.consecutiveSetupFailuresCurrent.WithLabelValues(runConfig.Name).Set(0)
metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonIncorrectStatus).Add(0)
metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonPanic).Add(0)
metrics.vmFailuresTotal.WithLabelValues(runConfig.Name, ReasonTimeout).Add(0)
metrics.RecordUp()
}

Expand Down Expand Up @@ -123,22 +125,15 @@ func (m *Metrics) RecordVmMemoryUsed(vmType string, memoryUsed uint64) {

func (m *Metrics) RecordSuccess(vmType string) {
m.successTotal.WithLabelValues(vmType).Inc()
m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
}

func (m *Metrics) RecordFailure(vmType string) {
m.failuresTotal.WithLabelValues(vmType).Inc()
m.consecutiveFailuresCurrent.WithLabelValues(vmType).Inc()
m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Set(0)
}

func (m *Metrics) RecordPanic(vmType string) {
m.panicsTotal.WithLabelValues(vmType).Inc()
// The result was bad, but we still completed setup successfully
m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
func (m *Metrics) RecordSetupFailure(vmType string) {
m.setupFailuresTotal.WithLabelValues(vmType).Inc()
m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Inc()
}

func (m *Metrics) RecordInvalid(vmType string) {
m.invalidTotal.WithLabelValues(vmType).Inc()
// The result was bad, but we still completed setup successfully
m.consecutiveFailuresCurrent.WithLabelValues(vmType).Set(0)
func (m *Metrics) RecordVmFailure(vmType string, reason string) {
m.vmFailuresTotal.WithLabelValues(vmType, reason).Inc()
m.consecutiveSetupFailuresCurrent.WithLabelValues(vmType).Set(0)
}
58 changes: 42 additions & 16 deletions op-challenger/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ import (

var (
ErrUnexpectedStatusCode = errors.New("unexpected status code")
ErrVMTimeout = errors.New("VM execution timed out")
)

type Metricer interface {
contractMetrics.ContractMetricer
metrics.VmMetricer
opmetrics.RPCMetricer

RecordFailure(vmType string)
RecordPanic(vmType string)
RecordInvalid(vmType string)
RecordSetupFailure(vmType string)
RecordVmFailure(vmType string, reason string)
RecordSuccess(vmType string)
}

Expand All @@ -56,11 +56,24 @@ type RunConfig struct {
PrestateFilename string
}

type TraceProviderCreator func(
ctx context.Context,
logger log.Logger,
m trace.Metricer,
cfg *config.Config,
prestateSource prestateFetcher,
gameType gameTypes.GameType,
localInputs utils.LocalGameInputs,
dir string,
) (types.TraceProvider, error)

type Runner struct {
log log.Logger
cfg *config.Config
runConfigs []RunConfig
m Metricer
log log.Logger
cfg *config.Config
runConfigs []RunConfig
m Metricer
vmTimeout time.Duration
traceProviderCreator TraceProviderCreator

running atomic.Bool
ctx context.Context
Expand All @@ -69,12 +82,14 @@ type Runner struct {
metricsSrv *httputil.HTTPServer
}

func NewRunner(logger log.Logger, cfg *config.Config, runConfigs []RunConfig) *Runner {
func NewRunner(logger log.Logger, cfg *config.Config, runConfigs []RunConfig, vmTimeout time.Duration) *Runner {
return &Runner{
log: logger,
cfg: cfg,
runConfigs: runConfigs,
m: NewMetrics(runConfigs),
log: logger,
cfg: cfg,
runConfigs: runConfigs,
m: NewMetrics(runConfigs),
vmTimeout: vmTimeout,
traceProviderCreator: createTraceProvider,
}
}

Expand Down Expand Up @@ -142,13 +157,16 @@ func (r *Runner) runAndRecordOnce(ctx context.Context, rlog log.Logger, runConfi
recordError := func(err error, configName string, m Metricer, log log.Logger) {
if errors.Is(err, ErrUnexpectedStatusCode) {
log.Error("Incorrect status code", "type", runConfig.Name, "err", err)
m.RecordInvalid(configName)
m.RecordVmFailure(configName, ReasonIncorrectStatus)
} else if errors.Is(err, trace.ErrVMPanic) {
log.Error("VM panicked", "type", runConfig.Name)
m.RecordPanic(configName)
m.RecordVmFailure(configName, ReasonPanic)
} else if errors.Is(err, ErrVMTimeout) {
log.Error("VM execution timed out", "type", runConfig.Name, "timeout", r.vmTimeout)
m.RecordVmFailure(configName, ReasonTimeout)
} else if err != nil {
log.Error("Failed to run", "type", runConfig.Name, "err", err)
m.RecordFailure(configName)
m.RecordSetupFailure(configName)
} else {
log.Info("Successfully verified output root", "type", runConfig.Name)
m.RecordSuccess(configName)
Expand Down Expand Up @@ -195,12 +213,20 @@ func (r *Runner) runAndRecordOnce(ctx context.Context, rlog log.Logger, runConfi
}

func (r *Runner) runOnce(ctx context.Context, logger log.Logger, name string, gameType gameTypes.GameType, prestateSource prestateFetcher, localInputs utils.LocalGameInputs, dir string) error {
provider, err := createTraceProvider(ctx, logger, metrics.NewTypedVmMetrics(r.m, name), r.cfg, prestateSource, gameType, localInputs, dir)
if r.vmTimeout > 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, r.vmTimeout)
defer cancel()
}
provider, err := r.traceProviderCreator(ctx, logger, metrics.NewTypedVmMetrics(r.m, name), r.cfg, prestateSource, gameType, localInputs, dir)
if err != nil {
return fmt.Errorf("failed to create trace provider: %w", err)
}
hash, err := provider.Get(ctx, types.RootPosition)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
return fmt.Errorf("%w: %w", ErrVMTimeout, err)
}
return fmt.Errorf("failed to execute trace provider: %w", err)
}
if hash[0] != mipsevm.VMStatusValid {
Expand Down
Loading