Skip to content

Commit c161967

Browse files
authored
fix(execution/evm): verify payload status (#2863)
PR adding retry logic and better payload verification (as found in the audit). ~~It was one shot by Claude Sonnet, so still checking it :D~~ done and updated.
1 parent bfa745c commit c161967

File tree

6 files changed

+454
-84
lines changed

6 files changed

+454
-84
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2525

2626
### Changed
2727

28+
- Improved EVM execution client payload status validation with proper retry logic for SYNCING states in `InitChain`, `ExecuteTxs`, and `SetFinal` methods. The implementation now follows Engine API specification by retrying SYNCING/ACCEPTED status with exponential backoff and failing immediately on INVALID status, preventing unnecessary node shutdowns during transient execution engine sync operations. ([#2863](https://github.com/evstack/ev-node/pull/2863))
2829
- Remove GasPrice and GasMultiplier from DA interface and configuration to use celestia-node's native fee estimation. ([#2822](https://github.com/evstack/ev-node/pull/2822))
2930
- Use cache instead of in memory store for reaper. Persist cache on reload. Autoclean after 24 hours. ([#2811](https://github.com/evstack/ev-node/pull/2811))
3031
- Improved P2P sync service store initialization to be atomic and prevent race conditions ([#2838](https://github.com/evstack/ev-node/pull/2838))

block/internal/executing/executor.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ func (e *Executor) signHeader(header types.Header) (types.Signature, error) {
594594
return e.signer.Sign(bz)
595595
}
596596

597-
// executeTxsWithRetry executes transactions with retry logic
597+
// executeTxsWithRetry executes transactions with retry logic.
598+
// NOTE: the function retries the execution client call regardless of the error. Some execution clients errors are irrecoverable, and will eventually halt the node, as expected.
598599
func (e *Executor) executeTxsWithRetry(ctx context.Context, rawTxs [][]byte, header types.Header, currentState types.State) ([]byte, error) {
599600
for attempt := 1; attempt <= common.MaxRetriesBeforeHalt; attempt++ {
600601
newAppHash, _, err := e.exec.ExecuteTxs(ctx, rawTxs, header.Height(), header.Time(), currentState.AppHash)

block/internal/submitting/submitter.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,8 @@ func (s *Submitter) processDAInclusionLoop() {
268268
}
269269
}
270270

271-
// setFinalWithRetry sets the final height in executor with retry logic
271+
// setFinalWithRetry sets the final height in executor with retry logic.
272+
// NOTE: the function retries the execution client call regardless of the error. Some execution clients errors are irrecoverable, and will eventually halt the node, as expected.
272273
func (s *Submitter) setFinalWithRetry(nextHeight uint64) error {
273274
for attempt := 1; attempt <= common.MaxRetriesBeforeHalt; attempt++ {
274275
if err := s.exec.SetFinal(s.ctx, nextHeight); err != nil {

block/internal/syncing/syncer.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,8 @@ func (s *Syncer) applyBlock(header types.Header, data *types.Data, currentState
593593
return newState, nil
594594
}
595595

596-
// executeTxsWithRetry executes transactions with retry logic
596+
// executeTxsWithRetry executes transactions with retry logic.
597+
// NOTE: the function retries the execution client call regardless of the error. Some execution clients errors are irrecoverable, and will eventually halt the node, as expected.
597598
func (s *Syncer) executeTxsWithRetry(ctx context.Context, rawTxs [][]byte, header types.Header, currentState types.State) ([]byte, error) {
598599
for attempt := 1; attempt <= common.MaxRetriesBeforeHalt; attempt++ {
599600
newAppHash, _, err := s.exec.ExecuteTxs(ctx, rawTxs, header.Height(), header.Time(), currentState.AppHash)

execution/evm/execution.go

Lines changed: 194 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,94 @@ import (
2222
"github.com/evstack/ev-node/core/execution"
2323
)
2424

25+
const (
26+
// MaxPayloadStatusRetries is the maximum number of retries for SYNCING status.
27+
// According to the Engine API specification, SYNCING indicates temporary unavailability
28+
// and should be retried with exponential backoff.
29+
MaxPayloadStatusRetries = 3
30+
// InitialRetryBackoff is the initial backoff duration for retries.
31+
// The backoff doubles on each retry attempt (exponential backoff).
32+
InitialRetryBackoff = 1 * time.Second
33+
)
34+
2535
var (
26-
// ErrInvalidPayloadStatus indicates that EVM returned status != VALID
36+
// ErrInvalidPayloadStatus indicates that the execution engine returned a permanent
37+
// failure status (INVALID or unknown status). This error should not be retried.
2738
ErrInvalidPayloadStatus = errors.New("invalid payload status")
39+
// ErrPayloadSyncing indicates that the execution engine is temporarily syncing.
40+
// According to the Engine API specification, this is a transient condition that
41+
// should be handled with retry logic rather than immediate failure.
42+
ErrPayloadSyncing = errors.New("payload syncing")
2843
)
2944

3045
// Ensure EngineAPIExecutionClient implements the execution.Execute interface
3146
var _ execution.Executor = (*EngineClient)(nil)
3247

48+
// validatePayloadStatus checks the payload status and returns appropriate errors.
49+
// It implements the Engine API specification's status handling:
50+
// - VALID: Operation succeeded, return nil
51+
// - SYNCING/ACCEPTED: Temporary unavailability, return ErrPayloadSyncing for retry
52+
// - INVALID: Permanent failure, return ErrInvalidPayloadStatus (no retry)
53+
// - Unknown: Treat as permanent failure (no retry)
54+
func validatePayloadStatus(status engine.PayloadStatusV1) error {
55+
switch status.Status {
56+
case engine.VALID:
57+
return nil
58+
case engine.SYNCING, engine.ACCEPTED:
59+
// SYNCING and ACCEPTED indicate temporary unavailability - should retry
60+
return ErrPayloadSyncing
61+
case engine.INVALID:
62+
// INVALID is a permanent failure - should not retry
63+
return ErrInvalidPayloadStatus
64+
default:
65+
// Unknown status - treat as invalid
66+
return ErrInvalidPayloadStatus
67+
}
68+
}
69+
70+
// retryWithBackoff executes a function with exponential backoff retry logic.
71+
// It implements the Engine API specification's recommendation to retry SYNCING
72+
// status with exponential backoff. The function:
73+
// - Retries only on ErrPayloadSyncing (transient failures)
74+
// - Fails immediately on ErrInvalidPayloadStatus (permanent failures)
75+
// - Respects context cancellation for graceful shutdown
76+
// - Uses exponential backoff that doubles on each attempt
77+
func retryWithBackoffOnPayloadStatus(ctx context.Context, fn func() error, maxRetries int, initialBackoff time.Duration, operation string) error {
78+
backoff := initialBackoff
79+
80+
for attempt := 1; attempt <= maxRetries; attempt++ {
81+
err := fn()
82+
if err == nil {
83+
return nil
84+
}
85+
86+
// Don't retry on invalid status
87+
if errors.Is(err, ErrInvalidPayloadStatus) {
88+
return err
89+
}
90+
91+
// Only retry on syncing status
92+
if !errors.Is(err, ErrPayloadSyncing) {
93+
return err
94+
}
95+
96+
// Check if we've exhausted retries
97+
if attempt >= maxRetries {
98+
return fmt.Errorf("max retries (%d) exceeded for %s: %w", maxRetries, operation, err)
99+
}
100+
101+
// Wait with exponential backoff
102+
select {
103+
case <-ctx.Done():
104+
return fmt.Errorf("context cancelled during retry for %s: %w", operation, ctx.Err())
105+
case <-time.After(backoff):
106+
backoff *= 2
107+
}
108+
}
109+
110+
return fmt.Errorf("max retries (%d) exceeded for %s", maxRetries, operation)
111+
}
112+
33113
// EngineClient represents a client that interacts with an Ethereum execution engine
34114
// through the Engine API. It manages connections to both the engine and standard Ethereum
35115
// APIs, and maintains state related to block processing.
@@ -105,28 +185,35 @@ func (c *EngineClient) InitChain(ctx context.Context, genesisTime time.Time, ini
105185
return nil, 0, fmt.Errorf("initialHeight must be 1, got %d", initialHeight)
106186
}
107187

108-
// Acknowledge the genesis block
109-
var forkchoiceResult engine.ForkChoiceResponse
110-
err := c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3",
111-
engine.ForkchoiceStateV1{
112-
HeadBlockHash: c.genesisHash,
113-
SafeBlockHash: c.genesisHash,
114-
FinalizedBlockHash: c.genesisHash,
115-
},
116-
nil,
117-
)
118-
if err != nil {
119-
return nil, 0, fmt.Errorf("engine_forkchoiceUpdatedV3 failed: %w", err)
120-
}
188+
// Acknowledge the genesis block with retry logic for SYNCING status
189+
err := retryWithBackoffOnPayloadStatus(ctx, func() error {
190+
var forkchoiceResult engine.ForkChoiceResponse
191+
err := c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3",
192+
engine.ForkchoiceStateV1{
193+
HeadBlockHash: c.genesisHash,
194+
SafeBlockHash: c.genesisHash,
195+
FinalizedBlockHash: c.genesisHash,
196+
},
197+
nil,
198+
)
199+
if err != nil {
200+
return fmt.Errorf("engine_forkchoiceUpdatedV3 failed: %w", err)
201+
}
121202

122-
// Validate payload status
123-
if forkchoiceResult.PayloadStatus.Status != engine.VALID {
124-
c.logger.Warn().
125-
Str("status", string(forkchoiceResult.PayloadStatus.Status)).
126-
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
127-
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
128-
Msg("InitChain: engine_forkchoiceUpdatedV3 returned non-VALID status")
129-
return nil, 0, fmt.Errorf("%w: status=%s", ErrInvalidPayloadStatus, forkchoiceResult.PayloadStatus.Status)
203+
// Validate payload status
204+
if err := validatePayloadStatus(forkchoiceResult.PayloadStatus); err != nil {
205+
c.logger.Warn().
206+
Str("status", forkchoiceResult.PayloadStatus.Status).
207+
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
208+
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
209+
Msg("InitChain: engine_forkchoiceUpdatedV3 returned non-VALID status")
210+
return err
211+
}
212+
213+
return nil
214+
}, MaxPayloadStatusRetries, InitialRetryBackoff, "InitChain")
215+
if err != nil {
216+
return nil, 0, err
130217
}
131218

132219
_, stateRoot, gasLimit, _, err := c.getBlockInfo(ctx, 0)
@@ -184,7 +271,7 @@ func (c *EngineClient) ExecuteTxs(ctx context.Context, txs [][]byte, blockHeight
184271

185272
// update forkchoice to get the next payload id
186273
// Create evolve-compatible payloadtimestamp.Unix()
187-
evPayloadAttrs := map[string]interface{}{
274+
evPayloadAttrs := map[string]any{
188275
// Standard Ethereum payload attributes (flattened) - using camelCase as expected by JSON
189276
"timestamp": timestamp.Unix(),
190277
"prevRandao": c.derivePrevRandao(blockHeight),
@@ -202,64 +289,82 @@ func (c *EngineClient) ExecuteTxs(ctx context.Context, txs [][]byte, blockHeight
202289
Int("tx_count", len(txs)).
203290
Msg("engine_forkchoiceUpdatedV3")
204291

205-
var forkchoiceResult engine.ForkChoiceResponse
206-
err = c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3", args, evPayloadAttrs)
207-
if err != nil {
208-
return nil, 0, fmt.Errorf("forkchoice update failed: %w", err)
209-
}
292+
// Call forkchoice update with retry logic for SYNCING status
293+
var payloadID *engine.PayloadID
294+
err = retryWithBackoffOnPayloadStatus(ctx, func() error {
295+
var forkchoiceResult engine.ForkChoiceResponse
296+
err := c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3", args, evPayloadAttrs)
297+
if err != nil {
298+
return fmt.Errorf("forkchoice update failed: %w", err)
299+
}
210300

211-
// Validate payload status
212-
if forkchoiceResult.PayloadStatus.Status != engine.VALID {
213-
c.logger.Warn().
214-
Str("status", string(forkchoiceResult.PayloadStatus.Status)).
215-
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
216-
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
217-
Uint64("blockHeight", blockHeight).
218-
Msg("ExecuteTxs: engine_forkchoiceUpdatedV3 returned non-VALID status")
219-
return nil, 0, fmt.Errorf("%w: status=%s", ErrInvalidPayloadStatus, forkchoiceResult.PayloadStatus.Status)
220-
}
301+
// Validate payload status
302+
if err := validatePayloadStatus(forkchoiceResult.PayloadStatus); err != nil {
303+
c.logger.Warn().
304+
Str("status", forkchoiceResult.PayloadStatus.Status).
305+
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
306+
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
307+
Uint64("blockHeight", blockHeight).
308+
Msg("ExecuteTxs: engine_forkchoiceUpdatedV3 returned non-VALID status")
309+
return err
310+
}
311+
312+
if forkchoiceResult.PayloadID == nil {
313+
c.logger.Error().
314+
Str("status", forkchoiceResult.PayloadStatus.Status).
315+
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
316+
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
317+
Interface("forkchoiceState", args).
318+
Interface("payloadAttributes", evPayloadAttrs).
319+
Uint64("blockHeight", blockHeight).
320+
Msg("returned nil PayloadID")
321+
322+
return fmt.Errorf("returned nil PayloadID - (status: %s, latestValidHash: %s)",
323+
forkchoiceResult.PayloadStatus.Status,
324+
forkchoiceResult.PayloadStatus.LatestValidHash.Hex())
325+
}
221326

222-
if forkchoiceResult.PayloadID == nil {
223-
c.logger.Error().
224-
Str("status", string(forkchoiceResult.PayloadStatus.Status)).
225-
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
226-
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
227-
Interface("forkchoiceState", args).
228-
Interface("payloadAttributes", evPayloadAttrs).
229-
Uint64("blockHeight", blockHeight).
230-
Msg("returned nil PayloadID")
231-
232-
return nil, 0, fmt.Errorf("returned nil PayloadID - (status: %s, latestValidHash: %s)",
233-
forkchoiceResult.PayloadStatus.Status,
234-
forkchoiceResult.PayloadStatus.LatestValidHash.Hex())
327+
payloadID = forkchoiceResult.PayloadID
328+
return nil
329+
}, MaxPayloadStatusRetries, InitialRetryBackoff, "ExecuteTxs forkchoice")
330+
if err != nil {
331+
return nil, 0, err
235332
}
236333

237334
// get payload
238335
var payloadResult engine.ExecutionPayloadEnvelope
239-
err = c.engineClient.CallContext(ctx, &payloadResult, "engine_getPayloadV4", *forkchoiceResult.PayloadID)
336+
err = c.engineClient.CallContext(ctx, &payloadResult, "engine_getPayloadV4", *payloadID)
240337
if err != nil {
241338
return nil, 0, fmt.Errorf("get payload failed: %w", err)
242339
}
243340

244-
// submit payload
341+
// Submit payload with retry logic for SYNCING status
245342
var newPayloadResult engine.PayloadStatusV1
246-
err = c.engineClient.CallContext(ctx, &newPayloadResult, "engine_newPayloadV4",
247-
payloadResult.ExecutionPayload,
248-
[]string{}, // No blob hashes
249-
common.Hash{}.Hex(), // Use zero hash for parentBeaconBlockRoot (same as in payload attributes)
250-
[][]byte{}, // No execution requests
251-
)
252-
if err != nil {
253-
return nil, 0, fmt.Errorf("new payload submission failed: %w", err)
254-
}
343+
err = retryWithBackoffOnPayloadStatus(ctx, func() error {
344+
err := c.engineClient.CallContext(ctx, &newPayloadResult, "engine_newPayloadV4",
345+
payloadResult.ExecutionPayload,
346+
[]string{}, // No blob hashes
347+
common.Hash{}.Hex(), // Use zero hash for parentBeaconBlockRoot (same as in payload attributes)
348+
[][]byte{}, // No execution requests
349+
)
350+
if err != nil {
351+
return fmt.Errorf("new payload submission failed: %w", err)
352+
}
255353

256-
if newPayloadResult.Status != engine.VALID {
257-
c.logger.Warn().
258-
Str("status", string(newPayloadResult.Status)).
259-
Str("latestValidHash", newPayloadResult.LatestValidHash.Hex()).
260-
Interface("validationError", newPayloadResult.ValidationError).
261-
Msg("engine_newPayloadV4 returned non-VALID status")
262-
return nil, 0, ErrInvalidPayloadStatus
354+
// Validate payload status
355+
if err := validatePayloadStatus(newPayloadResult); err != nil {
356+
c.logger.Warn().
357+
Str("status", newPayloadResult.Status).
358+
Str("latestValidHash", newPayloadResult.LatestValidHash.Hex()).
359+
Interface("validationError", newPayloadResult.ValidationError).
360+
Uint64("blockHeight", blockHeight).
361+
Msg("engine_newPayloadV4 returned non-VALID status")
362+
return err
363+
}
364+
return nil
365+
}, MaxPayloadStatusRetries, InitialRetryBackoff, "ExecuteTxs newPayload")
366+
if err != nil {
367+
return nil, 0, err
263368
}
264369

265370
// forkchoice update
@@ -290,19 +395,27 @@ func (c *EngineClient) setFinal(ctx context.Context, blockHash common.Hash, isFi
290395
}
291396
c.mu.Unlock()
292397

293-
var forkchoiceResult engine.ForkChoiceResponse
294-
err := c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3", args, nil)
295-
if err != nil {
296-
return fmt.Errorf("forkchoice update failed with error: %w", err)
297-
}
398+
// Call forkchoice update with retry logic for SYNCING status
399+
err := retryWithBackoffOnPayloadStatus(ctx, func() error {
400+
var forkchoiceResult engine.ForkChoiceResponse
401+
err := c.engineClient.CallContext(ctx, &forkchoiceResult, "engine_forkchoiceUpdatedV3", args, nil)
402+
if err != nil {
403+
return fmt.Errorf("forkchoice update failed: %w", err)
404+
}
298405

299-
if forkchoiceResult.PayloadStatus.Status != engine.VALID {
300-
c.logger.Warn().
301-
Str("status", string(forkchoiceResult.PayloadStatus.Status)).
302-
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
303-
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
304-
Msg("forkchoiceUpdatedV3 returned non-VALID status")
305-
return ErrInvalidPayloadStatus
406+
// Validate payload status
407+
if err := validatePayloadStatus(forkchoiceResult.PayloadStatus); err != nil {
408+
c.logger.Warn().
409+
Str("status", forkchoiceResult.PayloadStatus.Status).
410+
Str("latestValidHash", forkchoiceResult.PayloadStatus.LatestValidHash.Hex()).
411+
Interface("validationError", forkchoiceResult.PayloadStatus.ValidationError).
412+
Msg("forkchoiceUpdatedV3 returned non-VALID status")
413+
return err
414+
}
415+
return nil
416+
}, MaxPayloadStatusRetries, InitialRetryBackoff, "setFinal")
417+
if err != nil {
418+
return err
306419
}
307420

308421
return nil

0 commit comments

Comments
 (0)