From cc82819f277c014ef900250f9954c964011389e8 Mon Sep 17 00:00:00 2001 From: danceratopz Date: Wed, 10 Dec 2025 13:23:10 +0100 Subject: [PATCH 01/11] docs(testing-cli-consume): fix hive simulator names post-weld (#1842) Co-authored-by: spencer --- docs/filling_tests/test_ids.md | 4 +-- docs/running_tests/consume/exceptions.md | 2 +- docs/running_tests/execute/hive.md | 2 +- docs/running_tests/hive/client_config.md | 6 ++-- docs/running_tests/hive/common_options.md | 18 +++++----- docs/running_tests/hive/dev_mode.md | 43 +++++++++++------------ docs/running_tests/hive/index.md | 2 +- docs/running_tests/releases.md | 6 ++-- docs/running_tests/running.md | 22 ++++++------ 9 files changed, 52 insertions(+), 53 deletions(-) diff --git a/docs/filling_tests/test_ids.md b/docs/filling_tests/test_ids.md index 7005ab639b..069518bb06 100644 --- a/docs/filling_tests/test_ids.md +++ b/docs/filling_tests/test_ids.md @@ -55,8 +55,8 @@ The test framework can also generate blockchain tests containing blocks that spa Each Python test case is also typically parametrized by test type, respectively fixture format. For example, if the test is implemented as a `state_test`, the test framework will additionally generate the following blockchain test fixtures (consisting of a single block with a single transaction): -- a `blockchain_test` which can be tested via the Hive `eest/consume-rlp` simulator (or directly via a dedicated client interface). -- a `blockchain_engine_test` (for post-merge forks) which can be tested via the Hive `eest/consume-engine` simulator. +- a `blockchain_test` which can be tested via the Hive `eels/consume-rlp` simulator (or directly via a dedicated client interface). +- a `blockchain_engine_test` (for post-merge forks) which can be tested via the Hive `eels/consume-engine` simulator. ### Example: The Test IDs generated for `test_chainid` diff --git a/docs/running_tests/consume/exceptions.md b/docs/running_tests/consume/exceptions.md index 453284aedf..a5a82c499a 100644 --- a/docs/running_tests/consume/exceptions.md +++ b/docs/running_tests/consume/exceptions.md @@ -98,7 +98,7 @@ uv run consume engine --disable-strict-exception-matching=nimbus-el Enable verbose client output: ```bash -./hive --sim ethereum/eest/consume-engine \ +./hive --sim ethereum/eels/consume-engine \ --docker.output \ --sim.loglevel 5 ``` diff --git a/docs/running_tests/execute/hive.md b/docs/running_tests/execute/hive.md index 6af210f909..c4d89e48c0 100644 --- a/docs/running_tests/execute/hive.md +++ b/docs/running_tests/execute/hive.md @@ -2,7 +2,7 @@ Tests can be executed on a local hive-controlled single-client network by running the `execute hive` command. -## The `eest/execute-blobs` Simulator +## The `eels/execute-blobs` Simulator The `blob_transaction_test` execute test spec sends blob transactions to a running client. 
Blob transactions are fully supported in execute mode: diff --git a/docs/running_tests/hive/client_config.md b/docs/running_tests/hive/client_config.md index be04107f11..e2713780e9 100644 --- a/docs/running_tests/hive/client_config.md +++ b/docs/running_tests/hive/client_config.md @@ -102,17 +102,17 @@ cp -r /path/to/your/go-ethereum ./clients/go-ethereum/go-ethereum-local Force rebuild base images: ```bash -./hive --docker.pull --sim ethereum/eest/consume-engine +./hive --docker.pull --sim ethereum/eels/consume-engine ``` Force rebuild specific client: ```bash -./hive --docker.nocache "clients/go-ethereum" --sim ethereum/eest/consume-engine +./hive --docker.nocache "clients/go-ethereum" --sim ethereum/eels/consume-engine ``` Show the docker container build output: ```bash -./hive --docker.buildoutput --sim ethereum/eest/consume-engine +./hive --docker.buildoutput --sim ethereum/eels/consume-engine ``` diff --git a/docs/running_tests/hive/common_options.md b/docs/running_tests/hive/common_options.md index ff3d313252..2272e7d058 100644 --- a/docs/running_tests/hive/common_options.md +++ b/docs/running_tests/hive/common_options.md @@ -1,15 +1,15 @@ # Common Simulator Options -All EEST Hive simulators share common command-line options and patterns. +All execution-specs (EELS) Hive simulators share common command-line options and patterns. ## Basic Usage -While they may be omitted, it's recommended to specify the `fixtures` and `branch` simulator build arguments when running EEST simulators. +While they may be omitted, it's recommended to specify the `fixtures` and `branch` simulator build arguments when running execution-specs simulators. For example, this runs "stable" fixtures from the v4.3.0 [latest stable release](../releases.md#standard-releases) and builds the simulator at the v4.3.0 tag: ```bash -./hive --sim ethereum/eest/consume-engine \ +./hive --sim ethereum/eels/consume-engine \ --sim.buildarg fixtures=stable@v4.3.0 \ --sim.buildarg branch=v4.3.0 \ --client go-ethereum @@ -20,7 +20,7 @@ For example, this runs "stable" fixtures from the v4.3.0 [latest stable release] Run a subset of tests by filtering tests using `--sim.limit=` to perform a regular expression match against test IDs: ```bash -./hive --sim ethereum/eest/consume-engine --sim.limit ".*eip4844.*" +./hive --sim ethereum/eels/consume-engine --sim.limit ".*eip4844.*" ``` ### Collect Only/Dry-Run @@ -28,7 +28,7 @@ Run a subset of tests by filtering tests using `--sim.limit=` to perform The `collectonly:` prefix can be used to inspect which tests would match an expression (dry-run), `--docker.output` must be specified to see the simulator's collection result: ```bash -./hive --sim ethereum/eest/consume-engine \ +./hive --sim ethereum/eels/consume-engine \ --sim.buildarg fixtures=stable@v4.3.0 \ --sim.buildarg branch=v4.3.0 \ --docker.output \ @@ -40,7 +40,7 @@ The `collectonly:` prefix can be used to inspect which tests would match an expr The `id:` prefix can be used to select a single test via its ID (this will automatically escape any special characters in the test case ID): ```console -./hive --sim ethereum/eest/consume-engine \ +./hive --sim ethereum/eels/consume-engine \ --sim.buildarg fixtures=stable@v4.3.0 \ --sim.buildarg branch=v4.3.0 \ --docker.output \ @@ -52,7 +52,7 @@ The `id:` prefix can be used to select a single test via its ID (this will autom To run multiple tests in parallel, use `--sim.parallelism`: ```bash -./hive --sim ethereum/eest/consume-rlp --sim.parallelism 4 +./hive --sim 
ethereum/eels/consume-rlp --sim.parallelism 4 ``` ### Output Options @@ -60,7 +60,7 @@ To run multiple tests in parallel, use `--sim.parallelism`: See hive log output in the console: ```bash -./hive --sim ethereum/eest/consume-engine --sim.loglevel 5 +./hive --sim ethereum/eels/consume-engine --sim.loglevel 5 ``` ### Container Issues @@ -68,5 +68,5 @@ See hive log output in the console: Increase client timeout: ```bash -./hive --client.checktimelimit=180s --sim ethereum/eest/consume-engine +./hive --client.checktimelimit=180s --sim ethereum/eels/consume-engine ``` diff --git a/docs/running_tests/hive/dev_mode.md b/docs/running_tests/hive/dev_mode.md index 61b382ff44..965a7e1855 100644 --- a/docs/running_tests/hive/dev_mode.md +++ b/docs/running_tests/hive/dev_mode.md @@ -1,6 +1,6 @@ # Hive Development Mode -This section explains how to run EEST simulators using their EEST commands, e.g., `uv run consume engine`, against a Hive "development" server as apposed to using the standalone `./hive` command. +This section explains how to run EELS simulators using their Python-based commands, e.g., `uv run consume engine`, against a Hive "development" server as apposed to using the standalone `./hive` command. This avoids running the simulator in a dockerized environment and has several advantages: @@ -18,7 +18,7 @@ This avoids running the simulator in a dockerized environment and has several ad ### Prerequisites -- EEST is installed, see [Installation](../../getting_started/installation.md) +- The execution-specs repo is setup in development mode, see [Installation](../../getting_started/installation.md) - Hive is built, see [Hive](../hive/index.md#quick-start). ## Hive Dev Setup on Linux @@ -29,7 +29,7 @@ This avoids running the simulator in a dockerized environment and has several ad ./hive --dev --client go-ethereum --client-file clients.yaml --docker.output ``` -2. In a separate shell, configure environment for EEST: +2. In a separate shell, configure environment for execution-specs: === "bash/zsh" @@ -43,7 +43,7 @@ This avoids running the simulator in a dockerized environment and has several ad set -x HIVE_SIMULATOR http://127.0.0.1:3000 ``` -3. Run EEST consume commands +3. Run execution-specs `consume` commands ```bash uv run consume engine --input ./fixtures -k "test_chainid" @@ -56,37 +56,36 @@ Due to Docker running within a VM on macOS, the host machine and Docker containe 1. Linux VM: Run a Linux virtual machine on your macOS and execute the standard development workflow above from within the VM. 2. Remote Linux: SSH into a remote Linux environment (server or cloud instance) and run development mode there. -3. **Docker Development Image**: Create a containerized EEST environment that runs within Docker's network namespace (recommended). +3. **Docker Development Image**: Create a containerized execution-specs environment that runs within Docker's network namespace (recommended). The following section details the setup and usage of option 3. 
-### EEST Docker Development Image +### EELS Docker Development Image -Within the [`eest/`](https://github.com/ethereum/hive/tree/master/simulators/ethereum/eest) directory of hive, a new dockerfile must be created: `Dockerfile.dev`, with the following contents: +Within the [`eels/`](https://github.com/ethereum/hive/tree/master/simulators/ethereum/eels) directory of hive, a new dockerfile must be created: `Dockerfile.dev`, with the following contents: ```docker FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim -ARG branch=main -ENV GIT_REF=${branch} +ARG branch="" RUN apt-get update && apt-get install -y git -RUN git init execution-spec-tests && \ - cd execution-spec-tests && \ - git remote add origin https://github.com/ethereum/execution-spec-tests.git && \ - git fetch --depth 1 origin $GIT_REF && \ - git checkout FETCH_HEAD; - -WORKDIR /execution-spec-tests +RUN git clone --depth 1 https://github.com/ethereum/execution-specs.git && \ + cd execution-specs && \ + if [ -n "$branch" ]; then \ + git fetch --depth 1 origin "$branch" && \ + git checkout FETCH_HEAD; \ + fi +WORKDIR /execution-specs/packages/testing RUN uv sync ENTRYPOINT ["/bin/bash"] ``` -This dockerfile will be our entry point for running EEST commands. +This dockerfile will be our entry point for running simulator commands. -### `eest/` Hive Directory Structure +### `eels/` Hive Directory Structure ```tree -├── eest +├── eels │ ├── Dockerfile.dev │ ├── consume-block-rlp │ │ └── Dockerfile @@ -108,10 +107,10 @@ This dockerfile will be our entry point for running EEST commands. ./hive --dev --dev.addr :3000 --client go-ethereum --client-file clients.yaml ``` -3. In a separate terminal session, build the EEST development image: +3. In a separate terminal session, build the EELS development image: ```bash - cd simulators/ethereum/eest/ + cd simulators/ethereum/eels/ docker build -t macos-consume-dev -f Dockerfile.dev . ``` @@ -136,7 +135,7 @@ When Hive runs in dev mode: 3. Keeps the Hive Proxy container running between test executions. 4. Waits for external simulator connections via the API. -This allows EEST's consume commands to connect to the running Hive instance and execute tests interactively. +This allows the EELS's consume commands to connect to the running Hive instance and execute tests interactively. ## More Options Available diff --git a/docs/running_tests/hive/index.md b/docs/running_tests/hive/index.md index 81f75be5ee..aba902eb49 100644 --- a/docs/running_tests/hive/index.md +++ b/docs/running_tests/hive/index.md @@ -1,6 +1,6 @@ # Hive -@ethereum/hive is a containerized testing framework that helps orchestrate test execution against Ethereum clients. Hive is incredibly extensible; new test suites can be implemented in a module manner as "simulators" that interact with clients to test certain aspects of their behavior. EEST implements several simulators, see [Running Tests](../running.md) for an overview. +@ethereum/hive is a containerized testing framework that helps orchestrate test execution against Ethereum clients. Hive is incredibly extensible; new test suites can be implemented in a module manner as "simulators" that interact with clients to test certain aspects of their behavior. The execution-specs `testing` package implements several simulators, see [Running Tests](../running.md) for an overview. 
## Quick Start diff --git a/docs/running_tests/releases.md b/docs/running_tests/releases.md index 2136f0401a..7f7ea06cef 100644 --- a/docs/running_tests/releases.md +++ b/docs/running_tests/releases.md @@ -7,10 +7,10 @@ | Format | Consumed by the client | Location in `.tar.gz` release | | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | | [State Tests](./test_formats/state_test.md) | - directly via a `statetest`-like command
(e.g., [go-ethereum/cmd/evm/staterunner.go](https://github.com/ethereum/go-ethereum/blob/4bb097b7ffc32256791e55ff16ca50ef83c4609b/cmd/evm/staterunner.go)) | `./fixtures/state_tests/` | -| [Blockchain Tests](./test_formats/blockchain_test.md) | - directly via a `blocktest`-like command
(e.g., [go-ethereum/cmd/evm/blockrunner.go](https://github.com/ethereum/go-ethereum/blob/4bb097b7ffc32256791e55ff16ca50ef83c4609b/cmd/evm/blockrunner.go))
- using the [RLPeest/consume-rlp Simulator](./running.md#rlp) via block import | `./fixtures/blockchain_tests/` | -| [Blockchain Engine Tests](./test_formats/blockchain_test_engine.md) | - using the [eest/consume-engine Simulator](./running.md#engine) and the Engine API | `./fixtures/blockchain_tests_engine/` | +| [Blockchain Tests](./test_formats/blockchain_test.md) | - directly via a `blocktest`-like command
(e.g., [go-ethereum/cmd/evm/blockrunner.go](https://github.com/ethereum/go-ethereum/blob/4bb097b7ffc32256791e55ff16ca50ef83c4609b/cmd/evm/blockrunner.go))
- using the [eels/consume-rlp Simulator](./running.md#rlp) via block import | `./fixtures/blockchain_tests/` | +| [Blockchain Engine Tests](./test_formats/blockchain_test_engine.md) | - using the [eels/consume-engine Simulator](./running.md#engine) and the Engine API | `./fixtures/blockchain_tests_engine/` | | [Transaction Tests](./test_formats/transaction_test.md) | - using a new simulator coming soon | None; executed directly from Python source,
using a release tag | -| Blob Transaction Tests | - using the [eest/execute-blobs Simulator](./execute/hive.md#the-eestexecute-blobs-simulator) and | None; executed directly from Python source,
using a release tag | +| Blob Transaction Tests | - using the [eels/execute-blobs Simulator](./execute/hive.md#the-eelsexecute-blobs-simulator) | None; executed directly from Python source,
using a release tag | ## Release URLs and Tarballs diff --git a/docs/running_tests/running.md b/docs/running_tests/running.md index 94272d7d4e..efd98c340d 100644 --- a/docs/running_tests/running.md +++ b/docs/running_tests/running.md @@ -23,9 +23,9 @@ Both `consume` and `execute` provide sub-commands which correspond to different The following sections describe the different methods in more detail. -!!! note "`./hive --sim=eest/consume-engine` vs `consume engine`" +!!! note "`./hive --sim=eels/consume-engine` vs `consume engine`" - EEST simulators can be ran either standalone using the `./hive` command or via an EEST command against a `./hive --dev` backend, more details are [provided below](#two-methods-to-run-eest-simulators). + The execution-specs simulators can be ran either standalone using the `./hive` command or via a `uv`/Python-based command against a `./hive --dev` backend, more details are [provided below](#two-methods-to-run-eels-simulators). ## Direct @@ -48,7 +48,7 @@ The EEST `consume direct` command is a small wrapper around client direct interf | Nomenclature | | | -------------- | ------------------------ | | Command | `consume engine` | -| Simulator | `eest/consume-engine` | +| Simulator | `eels/consume-engine` | | Fixture format | `blockchain_test_engine` | The consume engine method tests execution clients via the Engine API by sending block payloads and verifying the response (post-merge forks only). This method provides the most realistic testing environment for production Ethereum client behavior, covering consensus integration, payload validation, and state synchronization. @@ -67,7 +67,7 @@ The `consume engine` command: | Nomenclature | | | -------------- | ------------------ | | Command | `consume rlp` | -| Simulator | `eest/consume-rlp` | +| Simulator | `eels/consume-rlp` | | Fixture format | `blockchain_test` | The RLP consumption method tests execution clients by providing them with RLP-encoded blocks to load upon startup, similar to the block import process during historical synchronization. This method tests the client's core block processing logic without the overhead of network protocols. @@ -103,15 +103,15 @@ The `consume sync` command: ## Engine vs RLP Simulator -The RLP Simulator (`eest/consume-rlp`) and the Engine Simulator (`eest/consume-engine`) should be seen as complimentary to one another. Although they execute the same underlying EVM test cases, the block validation logic is executed via different client code paths (using different [fixture formats](./test_formats/index.md)). Therefore, ideally, **both simulators should be executed for full coverage**. +The RLP Simulator (`eels/consume-rlp`) and the Engine Simulator (`eels/consume-engine`) should be seen as complimentary to one another. Although they execute the same underlying EVM test cases, the block validation logic is executed via different client code paths (using different [fixture formats](./test_formats/index.md)). Therefore, ideally, **both simulators should be executed for full coverage**. ### Code Path Choices -Clients consume fixtures in the `eest/consume-engine` simulator via the Engine API's `EngineNewPayloadv*` endpoint; a natural way to validate, respectively invalidate, block payloads. In this case, there is no flexibility in the choice of code path - it directly harnesses mainnet client functionality. The `eest/consume-rlp` Simulator, however, allows clients more freedom, as the rlp-encoded blocks are imported upon client startup. 
Clients are recommended to try and hook the block import into the code path used for historical syncing. +Clients consume fixtures in the `eels/consume-engine` simulator via the Engine API's `EngineNewPayloadv*` endpoint; a natural way to validate, respectively invalidate, block payloads. In this case, there is no flexibility in the choice of code path - it directly harnesses mainnet client functionality. The `eels/consume-rlp` Simulator, however, allows clients more freedom, as the rlp-encoded blocks are imported upon client startup. Clients are recommended to try and hook the block import into the code path used for historical syncing. ### Differences -| | `eest/consume-rlp` | `eest/consume-engine` | +| | `eels/consume-rlp` | `eels/consume-engine` | | ----------------------- | ----------------------------------------------------- | ------------------------------------------------------------------ | | **Fixture Format Used** | [`BlockchainTest`](./test_formats/blockchain_test.md) | [`BlockchainTestEngine`](./test_formats/blockchain_test_engine.md) | | **Fork support** | All forks (including pre-merge) | Post-merge forks only (Paris+) | @@ -128,9 +128,9 @@ Clients consume fixtures in the `eest/consume-engine` simulator via the Engine A See [Execute Command](./execute/index.md). -## Two Methods to Run EEST Simulators +## Two Methods to Run EELS Simulators -Many of the methods use the Hive Testing Environment to interact with clients and run tests against them. These methods are also called Hive simulators. While Hive is always necessary to run simulators, they can be called in two different ways. Both of these commands execute the same simulator code, but in different environments, we take the example of the `eest/consume-engine` simulator: +Many of the methods use the Hive Testing Environment to interact with clients and run tests against them. These methods are also called Hive simulators. While Hive is always necessary to run simulators, they can be called in two different ways. Both of these commands execute the same simulator code, but in different environments, we take the example of the `eels/consume-engine` simulator: -1. `./hive --sim=eest/consume-engine` is a standalone command that installs EEST and the `consume` command in a dockerized container managed by Hive. This is the standard method to execute EEST [fixture releases](./releases.md) against clients in CI environments and is the method to generate the results at [hive.ethpandaops.io](https://hive.ethpandaops.io). See [Hive](./hive/index.md) and its [Common Options](./hive/common_options.md) for help with this method. -2. `uv run consume engine` requires the user to clone and [install EEST](../getting_started/installation.md) and start a Hive server in [development mode](./hive/dev_mode.md). In this case, the simulator runs on the native system and communicate to the client via the Hive API. This is particularly useful during test development as fixtures on the local disk can be specified via `--input=fixtures/`. As the simulator runs natively, it is easy to drop into a debugger and inspect the simulator or client container state. See [Hive Developer Mode](./hive/dev_mode.md) for help with this method. +1. `./hive --sim=eels/consume-engine` is a standalone command that installs and configures execution-specs and its `consume` command in a dockerized container managed by Hive. 
This is the standard method to execute EEST [fixture releases](./releases.md) against clients in CI environments and is the method to generate the results at [hive.ethpandaops.io](https://hive.ethpandaops.io). See [Hive](./hive/index.md) and its [Common Options](./hive/common_options.md) for help with this method. +2. `uv run consume engine` requires the user to clone and [configure execution-specs](../getting_started/installation.md) and start a Hive server in [development mode](./hive/dev_mode.md). In this case, the simulator runs on the native system and communicate to the client via the Hive API. This is particularly useful during test development as fixtures on the local disk can be specified via `--input=fixtures/`. As the simulator runs natively, it is easy to drop into a debugger and inspect the simulator or client container state. See [Hive Developer Mode](./hive/dev_mode.md) for help with this method. From 68cac702fdc41416a006b3dac16060b77913acaa Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Tue, 9 Dec 2025 14:29:29 +0800 Subject: [PATCH 02/11] feat: implement opcode verification --- .../plugins/shared/benchmarking.py | 9 +-- .../src/execution_testing/specs/benchmark.py | 75 +++++++++++++++++-- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py index acc29794ae..2c33937bf1 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py @@ -110,7 +110,7 @@ def get_opcode_counts_for_test( def pytest_collection_modifyitems( config: pytest.Config, items: list[pytest.Item] ) -> None: - """Filter tests based on repricing marker when benchmark options are used.""" + """Filter tests based on repricing marker.""" gas_benchmark_value = config.getoption("gas_benchmark_value") fixed_opcode_count = config.getoption("fixed_opcode_count") @@ -140,11 +140,9 @@ def pytest_collection_modifyitems( filtered = [] for item in items: - if not item.get_closest_marker("benchmark"): - continue - repricing_marker = item.get_closest_marker("repricing") if not repricing_marker: + filtered.append(item) continue if not repricing_marker.kwargs: @@ -251,7 +249,7 @@ def gas_benchmark_value(request: pytest.FixtureRequest) -> int: # Only use high gas limit if --fixed-opcode-count flag was provided fixed_opcode_count = request.config.getoption("fixed_opcode_count") if fixed_opcode_count is not None: - return HIGH_GAS_LIMIT + return BENCHMARKING_MAX_GAS return EnvironmentDefaults.gas_limit @@ -266,7 +264,6 @@ def fixed_opcode_count(request: pytest.FixtureRequest) -> int | None: BENCHMARKING_MAX_GAS = 1_000_000_000_000 -HIGH_GAS_LIMIT = 1_000_000_000 @pytest.fixture diff --git a/packages/testing/src/execution_testing/specs/benchmark.py b/packages/testing/src/execution_testing/specs/benchmark.py index 67e8770f04..4bdaf0ebac 100644 --- a/packages/testing/src/execution_testing/specs/benchmark.py +++ b/packages/testing/src/execution_testing/specs/benchmark.py @@ -19,6 +19,7 @@ from execution_testing.base_types import Address, HexNumber from execution_testing.client_clis import TransitionTool +from execution_testing.client_clis.cli_types import OpcodeCount from execution_testing.exceptions import ( BlockException, TransactionException, @@ -56,6 +57,7 @@ class BenchmarkCodeGenerator(ABC): 
fixed_opcode_count: int | None = None code_padding_opcode: Op | None = None _contract_address: Address | None = None + _inner_iterations: int = 1000 @abstractmethod def deploy_contracts(self, *, pre: Alloc, fork: Fork) -> Address: @@ -72,8 +74,13 @@ def deploy_fix_count_contracts(self, *, pre: Alloc, fork: Fork) -> Address: ) self._target_contract_address = pre.deploy_contract(code=code) - iterations = self.fixed_opcode_count - assert iterations is not None, "fixed_opcode_count is not set" + assert self.fixed_opcode_count is not None, ( + "fixed_opcode_count is not set" + ) + # Adjust outer loop iterations based on inner iterations + # If inner is 500 instead of 1000, double the outer loop + outer_multiplier = 1000 // self._inner_iterations + iterations = self.fixed_opcode_count * outer_multiplier prefix = Op.CALLDATACOPY( Op.PUSH0, Op.PUSH0, Op.CALLDATASIZE @@ -82,7 +89,7 @@ def deploy_fix_count_contracts(self, *, pre: Alloc, fork: Fork) -> Address: prefix + Op.JUMPDEST + Op.POP( - Op.STATICCALL( + Op.DELEGATECALL( gas=Op.GAS, address=self._target_contract_address, args_offset=0, @@ -148,11 +155,32 @@ def generate_repeated_code( max_iterations = available_space // len(repeated_code) # Use fixed_opcode_count if provided, otherwise fill to max + # Iteration Logic: The goal is to set the total operation count proportional to a + # 'fixed_opcode_count' multiplied by 1000, across two contracts (Loop M * Target N). + + # --- 1. Determine Inner Iterations (N) --- + # The Target Contract's loop count is determined by block filling, capped at 1000. + # + # 1a. Calculate 'max_iterations' to fill the block. + # 1b. The Inner Iteration count (N) is capped at 1000. + # 1c. If the calculated N is less than 1000, use 500 as the fallback count. + + # --- 2. Determine Outer Iterations (M) --- + # The Loop Contract's call count (M) is set to ensure the final total execution is consistent. + # + # 2a. If N is 1000: Set M = fixed_opcode_count. (Total ops: fixed_opcode_count * 1000) + # 2b. If N is 500: Set M = fixed_opcode_count * 2. (Total ops: (fixed_opcode_count * 2) * 500 = fixed_opcode_count * 1000) if self.fixed_opcode_count is not None: - max_iterations = min(max_iterations, 1000) + inner_iterations = 1000 if max_iterations >= 1000 else 500 + self._inner_iterations = min(max_iterations, inner_iterations) # TODO: Unify the PUSH0 and PUSH1 usage. - code = setup + Op.JUMPDEST + repeated_code * max_iterations + iterations = ( + self._inner_iterations + if self.fixed_opcode_count + else max_iterations + ) + code = setup + Op.JUMPDEST + repeated_code * iterations if self.fixed_opcode_count is None: code += cleanup + ( Op.JUMP(len(setup)) if len(setup) > 0 else Op.PUSH0 + Op.JUMP @@ -197,6 +225,7 @@ class BenchmarkTest(BaseTest): default_factory=lambda: int(Environment().gas_limit) ) fixed_opcode_count: int | None = None + target_opcode: Op | None = None code_generator: BenchmarkCodeGenerator | None = None supported_fixture_formats: ClassVar[ @@ -392,6 +421,31 @@ def generate_blockchain_test(self) -> BlockchainTest: blocks=self.blocks, ) + def _verify_target_opcode_count( + self, opcode_count: OpcodeCount | None + ) -> None: + """Verify the target opcode was executed the expected number of times.""" + # Skip validation if opcode count is not available (e.g. 
currently only supported for evmone filling) + if opcode_count is None: + return + + assert self.target_opcode is not None, "target_opcode is not set" + assert self.fixed_opcode_count is not None, ( + "fixed_opcode_count is not set" + ) + + # fixed_opcode_count is in thousands units + expected = self.fixed_opcode_count * 1000 + + actual = opcode_count.root.get(self.target_opcode, 0) + tolerance = expected * 0.05 # 5% tolerance + + if abs(actual - expected) > tolerance: + raise ValueError( + f"Target opcode {self.target_opcode} count mismatch: " + f"expected ~{expected} (±5%), got {actual}" + ) + def generate( self, t8n: TransitionTool, @@ -402,9 +456,18 @@ def generate( exception=self.tx.error is not None if self.tx else False ) if fixture_format in BlockchainTest.supported_fixture_formats: - return self.generate_blockchain_test().generate( + blockchain_test = self.generate_blockchain_test() + fixture = blockchain_test.generate( t8n=t8n, fixture_format=fixture_format ) + + # Verify target opcode count if specified + if ( + self.target_opcode is not None + and self.fixed_opcode_count is not None + ): + self._verify_target_opcode_count(blockchain_test._opcode_count) + return fixture else: raise Exception(f"Unsupported fixture format: {fixture_format}") From ce66712472d81d51ea7d12a56316a8cde52b87c2 Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Tue, 9 Dec 2025 14:36:09 +0800 Subject: [PATCH 03/11] feat: label targeted opcode --- .../compute/instruction/test_account_query.py | 8 ++++++- .../compute/instruction/test_arithmetic.py | 5 +++- .../compute/instruction/test_bitwise.py | 23 +++++++++++++------ .../compute/instruction/test_block_context.py | 2 ++ .../compute/instruction/test_call_context.py | 13 +++++++++-- .../compute/instruction/test_comparison.py | 2 ++ .../compute/instruction/test_control_flow.py | 18 ++++++++++++--- .../compute/instruction/test_keccak.py | 2 ++ .../benchmark/compute/instruction/test_log.py | 1 + .../compute/instruction/test_memory.py | 3 +++ .../compute/instruction/test_stack.py | 3 +++ .../compute/instruction/test_system.py | 6 ++++- .../compute/instruction/test_tx_context.py | 2 ++ 13 files changed, 73 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/compute/instruction/test_account_query.py b/tests/benchmark/compute/instruction/test_account_query.py index d5adb41f62..5f8f36997b 100644 --- a/tests/benchmark/compute/instruction/test_account_query.py +++ b/tests/benchmark/compute/instruction/test_account_query.py @@ -49,6 +49,7 @@ def test_selfbalance( ) -> None: """Benchmark SELFBALANCE instruction.""" benchmark_test( + target_opcode=Op.SELFBALANCE, code_generator=ExtCallGenerator( attack_block=Op.SELFBALANCE, contract_balance=contract_balance, @@ -62,6 +63,7 @@ def test_codesize( ) -> None: """Benchmark CODESIZE instruction.""" benchmark_test( + target_opcode=Op.CODESIZE, code_generator=ExtCallGenerator(attack_block=Op.CODESIZE), ) @@ -99,11 +101,12 @@ def test_codecopy( attack_block = Op.CODECOPY(src_dst, src_dst, Op.DUP1) # DUP1 copies size. 
benchmark_test( + target_opcode=Op.CODECOPY, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, code_padding_opcode=Op.STOP, - ) + ), ) @@ -334,6 +337,7 @@ def test_extcodecopy_warm( ) benchmark_test( + target_opcode=Op.EXTCODECOPY, code_generator=JumpLoopGenerator( setup=Op.PUSH10(copied_size) + Op.PUSH20(copied_contract_address), attack_block=Op.EXTCODECOPY(Op.DUP4, 0, 0, Op.DUP2), @@ -411,6 +415,7 @@ def test_ext_account_query_warm( post[target_addr] = Account(**contract_kwargs) benchmark_test( + target_opcode=opcode, post=post, code_generator=JumpLoopGenerator( setup=Op.MSTORE(0, target_addr), @@ -514,6 +519,7 @@ def test_ext_account_query_cold( blocks.append(Block(txs=[op_tx])) benchmark_test( + target_opcode=opcode, post=post, blocks=blocks, ) diff --git a/tests/benchmark/compute/instruction/test_arithmetic.py b/tests/benchmark/compute/instruction/test_arithmetic.py index c7a517ca48..d625907539 100644 --- a/tests/benchmark/compute/instruction/test_arithmetic.py +++ b/tests/benchmark/compute/instruction/test_arithmetic.py @@ -149,6 +149,7 @@ def test_arithmetic( attack_block = Op.DUP2 + opcode cleanup = Op.POP + Op.POP + Op.DUP2 + Op.DUP2 benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, @@ -388,4 +389,6 @@ def test_mod_arithmetic( sender=pre.fund_eoa(), ) - benchmark_test(tx=tx) + benchmark_test( + tx=tx, + ) diff --git a/tests/benchmark/compute/instruction/test_bitwise.py b/tests/benchmark/compute/instruction/test_bitwise.py index 01750454ae..462870ff85 100644 --- a/tests/benchmark/compute/instruction/test_bitwise.py +++ b/tests/benchmark/compute/instruction/test_bitwise.py @@ -101,6 +101,7 @@ def test_bitwise( attack_block = Op.DUP2 + opcode cleanup = Op.POP + Op.POP + Op.DUP2 + Op.DUP2 benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, @@ -118,16 +119,17 @@ def test_not_op( Benchmark NOT instruction (takes one arg, pushes one value). """ benchmark_test( + target_opcode=Op.NOT, code_generator=JumpLoopGenerator(setup=Op.PUSH0, attack_block=Op.NOT), ) -@pytest.mark.parametrize("shift_right", [Op.SHR, Op.SAR]) +@pytest.mark.parametrize("opcode", [Op.SHR, Op.SAR]) def test_shifts( benchmark_test: BenchmarkTestFiller, pre: Alloc, fork: Fork, - shift_right: Op, + opcode: Op, gas_benchmark_value: int, ) -> None: """ @@ -138,13 +140,13 @@ def test_shifts( """ max_code_size = fork.max_code_size() - match shift_right: + match opcode: case Op.SHR: shift_right_fn = shr case Op.SAR: shift_right_fn = sar case _: - raise ValueError(f"Unexpected shift op: {shift_right}") + raise ValueError(f"Unexpected shift op: {opcode}") rng = random.Random(1) # Use random with a fixed seed. 
initial_value = 2**256 - 1 # The initial value to be shifted; should be @@ -180,7 +182,7 @@ def select_shift_amount( v, i = select_shift_amount(shl, v) code_body += make_dup(len(shift_amounts) - i) + Op.SHL v, i = select_shift_amount(shift_right_fn, v) - code_body += make_dup(len(shift_amounts) - i) + shift_right + code_body += make_dup(len(shift_amounts) - i) + opcode code = code_prefix + code_body + code_suffix assert len(code) == max_code_size - 2 @@ -192,7 +194,10 @@ def select_shift_amount( sender=pre.fund_eoa(), ) - benchmark_test(tx=tx) + benchmark_test( + target_opcode=opcode, + tx=tx, + ) @pytest.mark.repricing @@ -201,6 +206,7 @@ def test_clz_same(benchmark_test: BenchmarkTestFiller) -> None: """Benchmark CLZ instruction with same input.""" magic_value = 248 # CLZ(248) = 248 benchmark_test( + target_opcode=Op.CLZ, code_generator=JumpLoopGenerator( setup=Op.PUSH1(magic_value), attack_block=Op.CLZ ), @@ -238,4 +244,7 @@ def test_clz_diff( sender=pre.fund_eoa(), ) - benchmark_test(tx=tx) + benchmark_test( + target_opcode=Op.CLZ, + tx=tx, + ) diff --git a/tests/benchmark/compute/instruction/test_block_context.py b/tests/benchmark/compute/instruction/test_block_context.py index ab0b0ab913..ce8513f251 100644 --- a/tests/benchmark/compute/instruction/test_block_context.py +++ b/tests/benchmark/compute/instruction/test_block_context.py @@ -43,6 +43,7 @@ def test_block_context_ops( ) -> None: """Benchmark zero-parameter block context instructions.""" benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator(attack_block=opcode), ) @@ -73,6 +74,7 @@ def test_blockhash( block_number = Op.AND(Op.GAS, 0xFF) if index is None else index benchmark_test( + target_opcode=Op.BLOCKHASH, setup_blocks=blocks, code_generator=ExtCallGenerator( attack_block=Op.BLOCKHASH(block_number) diff --git a/tests/benchmark/compute/instruction/test_call_context.py b/tests/benchmark/compute/instruction/test_call_context.py index cfa0f3830d..6ab2e0a29e 100644 --- a/tests/benchmark/compute/instruction/test_call_context.py +++ b/tests/benchmark/compute/instruction/test_call_context.py @@ -43,6 +43,7 @@ def test_call_frame_context_ops( ) -> None: """Benchmark call zero-parameter instructions.""" benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator(attack_block=opcode), ) @@ -55,6 +56,7 @@ def test_calldatasize( ) -> None: """Benchmark CALLDATASIZE instruction.""" benchmark_test( + target_opcode=Op.CALLDATASIZE, code_generator=ExtCallGenerator( attack_block=Op.CALLDATASIZE, tx_kwargs={"data": b"\x00" * calldata_length}, @@ -71,6 +73,7 @@ def test_callvalue_from_origin( Benchmark CALLVALUE instruction from origin. 
""" benchmark_test( + target_opcode=Op.CALLVALUE, code_generator=JumpLoopGenerator( attack_block=Op.POP(Op.CALLVALUE), tx_kwargs={"value": int(non_zero_value)}, @@ -123,6 +126,7 @@ def test_calldataload( ) -> None: """Benchmark CALLDATALOAD instruction.""" benchmark_test( + target_opcode=Op.CALLDATALOAD, code_generator=JumpLoopGenerator( setup=Op.PUSH0, attack_block=Op.CALLDATALOAD, @@ -195,11 +199,12 @@ def test_calldatacopy_from_origin( ) benchmark_test( + target_opcode=Op.CALLDATACOPY, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, tx_kwargs={"data": data}, - ) + ), ) @@ -267,11 +272,12 @@ def test_calldatacopy_from_call( ) benchmark_test( + target_opcode=Op.CALLDATACOPY, code_generator=ExtCallGenerator( setup=setup, attack_block=attack_block, tx_kwargs={"data": data}, - ) + ), ) @@ -316,6 +322,7 @@ def test_returndatasize_nonzero( ) benchmark_test( + target_opcode=Op.RETURNDATASIZE, code_generator=JumpLoopGenerator( setup=setup, attack_block=Op.POP(Op.RETURNDATASIZE) ), @@ -328,6 +335,7 @@ def test_returndatasize_zero( ) -> None: """Benchmark RETURNDATASIZE instruction with zero buffer.""" benchmark_test( + target_opcode=Op.RETURNDATASIZE, code_generator=ExtCallGenerator(attack_block=Op.RETURNDATASIZE), ) @@ -377,6 +385,7 @@ def test_returndatacopy( attack_block = Op.RETURNDATACOPY(dst, Op.PUSH0, Op.RETURNDATASIZE) benchmark_test( + target_opcode=Op.RETURNDATACOPY, code_generator=JumpLoopGenerator( setup=returndata_gen, attack_block=attack_block, diff --git a/tests/benchmark/compute/instruction/test_comparison.py b/tests/benchmark/compute/instruction/test_comparison.py index bb93444100..eefacc276a 100644 --- a/tests/benchmark/compute/instruction/test_comparison.py +++ b/tests/benchmark/compute/instruction/test_comparison.py @@ -71,6 +71,7 @@ def test_comparison( attack_block = Op.DUP2 + opcode cleanup = Op.POP + Op.POP + Op.DUP2 + Op.DUP2 benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, @@ -88,6 +89,7 @@ def test_iszero( Benchmark ISZERO instruction (takes one arg, pushes one value). 
""" benchmark_test( + target_opcode=Op.ISZERO, code_generator=JumpLoopGenerator( setup=Op.PUSH0, attack_block=Op.ISZERO, diff --git a/tests/benchmark/compute/instruction/test_control_flow.py b/tests/benchmark/compute/instruction/test_control_flow.py index ce8e0e4f52..8b6de630eb 100644 --- a/tests/benchmark/compute/instruction/test_control_flow.py +++ b/tests/benchmark/compute/instruction/test_control_flow.py @@ -30,6 +30,7 @@ def test_gas_op( ) -> None: """Benchmark GAS instruction.""" benchmark_test( + target_opcode=Op.GAS, code_generator=ExtCallGenerator(attack_block=Op.GAS), ) @@ -39,6 +40,7 @@ def test_pc_op( ) -> None: """Benchmark PC instruction.""" benchmark_test( + target_opcode=Op.PC, code_generator=ExtCallGenerator(attack_block=Op.PC), ) @@ -53,7 +55,10 @@ def test_jumps( sender=pre.fund_eoa(), ) - benchmark_test(tx=tx) + benchmark_test( + target_opcode=Op.JUMP, + tx=tx, + ) @pytest.mark.repricing @@ -62,6 +67,7 @@ def test_jumpi_fallthrough( ) -> None: """Benchmark JUMPI instruction with fallthrough.""" benchmark_test( + target_opcode=Op.JUMPI, code_generator=JumpLoopGenerator( attack_block=Op.JUMPI(Op.PUSH0, Op.PUSH0) ), @@ -80,7 +86,10 @@ def test_jumpis( sender=pre.fund_eoa(), ) - benchmark_test(tx=tx) + benchmark_test( + target_opcode=Op.JUMPI, + tx=tx, + ) @pytest.mark.repricing @@ -88,4 +97,7 @@ def test_jumpdests( benchmark_test: BenchmarkTestFiller, ) -> None: """Benchmark JUMPDEST instruction.""" - benchmark_test(code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST)) + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) diff --git a/tests/benchmark/compute/instruction/test_keccak.py b/tests/benchmark/compute/instruction/test_keccak.py index 4cefa708a4..b392a64c8d 100644 --- a/tests/benchmark/compute/instruction/test_keccak.py +++ b/tests/benchmark/compute/instruction/test_keccak.py @@ -63,6 +63,7 @@ def test_keccak_max_permutations( optimal_input_length = i benchmark_test( + target_opcode=Op.SHA3, code_generator=JumpLoopGenerator( setup=Op.PUSH20[optimal_input_length], attack_block=Op.POP(Op.SHA3(Op.PUSH0, Op.DUP1)), @@ -79,6 +80,7 @@ def test_keccak( ) -> None: """Benchmark KECCAK256 instruction with diff input data and offsets.""" benchmark_test( + target_opcode=Op.SHA3, code_generator=JumpLoopGenerator( setup=Op.CALLDATACOPY(offset, Op.PUSH0, Op.CALLDATASIZE), attack_block=Op.POP(Op.SHA3(offset, Op.CALLDATASIZE)), diff --git a/tests/benchmark/compute/instruction/test_log.py b/tests/benchmark/compute/instruction/test_log.py index eaccaf5700..ca176128c9 100644 --- a/tests/benchmark/compute/instruction/test_log.py +++ b/tests/benchmark/compute/instruction/test_log.py @@ -81,6 +81,7 @@ def test_log( attack_block = Op.DUP1 * topic_count + size_op + offset + opcode benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block ), diff --git a/tests/benchmark/compute/instruction/test_memory.py b/tests/benchmark/compute/instruction/test_memory.py index e4a9decb8e..01f9670c1f 100644 --- a/tests/benchmark/compute/instruction/test_memory.py +++ b/tests/benchmark/compute/instruction/test_memory.py @@ -27,6 +27,7 @@ def test_msize( ) -> None: """Benchmark MSIZE instruction.""" benchmark_test( + target_opcode=Op.MSIZE, code_generator=ExtCallGenerator( setup=Op.POP(Op.MLOAD(Op.SELFBALANCE)), attack_block=Op.MSIZE, @@ -67,6 +68,7 @@ def test_memory_access( ) benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block 
), @@ -107,6 +109,7 @@ def test_mcopy( else Bytecode() ) benchmark_test( + target_opcode=Op.MCOPY, code_generator=JumpLoopGenerator( attack_block=attack_block, cleanup=mem_touch ), diff --git a/tests/benchmark/compute/instruction/test_stack.py b/tests/benchmark/compute/instruction/test_stack.py index 0dd197b026..5744b198fe 100644 --- a/tests/benchmark/compute/instruction/test_stack.py +++ b/tests/benchmark/compute/instruction/test_stack.py @@ -45,6 +45,7 @@ def test_swap( ) -> None: """Benchmark SWAP instruction.""" benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( attack_block=opcode, setup=Op.PUSH0 * opcode.min_stack_height ), @@ -80,6 +81,7 @@ def test_dup( """Benchmark DUP instruction.""" min_stack_height = opcode.min_stack_height benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator( setup=Op.PUSH0 * min_stack_height, attack_block=opcode, @@ -132,6 +134,7 @@ def test_push( ) -> None: """Benchmark PUSH instruction.""" benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator( attack_block=opcode[1] if opcode.has_data_portion() else opcode ), diff --git a/tests/benchmark/compute/instruction/test_system.py b/tests/benchmark/compute/instruction/test_system.py index 78fde39b9a..71d24089fb 100644 --- a/tests/benchmark/compute/instruction/test_system.py +++ b/tests/benchmark/compute/instruction/test_system.py @@ -349,11 +349,12 @@ def test_create( ) benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, contract_balance=1_000_000_000 if value > 0 else 0, - ) + ), ) @@ -422,6 +423,7 @@ def test_creates_collisions( pre.deploy_contract(address=addr, code=Op.INVALID) benchmark_test( + target_opcode=opcode, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block ), @@ -465,6 +467,7 @@ def test_return_revert( Op.CODECOPY(size=return_size) if return_non_zero_data else Bytecode() ) benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator( setup=mem_preparation, attack_block=opcode(size=return_size), @@ -623,6 +626,7 @@ def test_selfdestruct_existing( benchmark_test( post=post, + target_opcode=Op.SELFDESTRUCT, blocks=[ Block(txs=[contracts_deployment_tx]), Block(txs=[opcode_tx], fee_recipient=fee_recipient), diff --git a/tests/benchmark/compute/instruction/test_tx_context.py b/tests/benchmark/compute/instruction/test_tx_context.py index 12cc1e2acf..e6eb35b330 100644 --- a/tests/benchmark/compute/instruction/test_tx_context.py +++ b/tests/benchmark/compute/instruction/test_tx_context.py @@ -36,6 +36,7 @@ def test_call_frame_context_ops( ) -> None: """Benchmark call zero-parameter instructions.""" benchmark_test( + target_opcode=opcode, code_generator=ExtCallGenerator(attack_block=opcode), ) @@ -67,6 +68,7 @@ def test_blobhash( ) benchmark_test( + target_opcode=Op.BLOBHASH, code_generator=ExtCallGenerator( attack_block=Op.BLOBHASH(blob_index), tx_kwargs=tx_kwargs, From b719ddf15088c1013be606ec962ebf2675024c8d Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Tue, 9 Dec 2025 14:36:39 +0800 Subject: [PATCH 04/11] chore: ignore unsupported tests --- tests/benchmark/compute/instruction/test_account_query.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/benchmark/compute/instruction/test_account_query.py b/tests/benchmark/compute/instruction/test_account_query.py index 5f8f36997b..b045ac4989 100644 --- a/tests/benchmark/compute/instruction/test_account_query.py +++ b/tests/benchmark/compute/instruction/test_account_query.py @@ -445,10 +445,14 @@ def 
test_ext_account_query_cold( absent_accounts: bool, env: Environment, gas_benchmark_value: int, + fixed_opcode_count: int, ) -> None: """ Benchmark stateful opcodes accessing cold accounts. """ + if fixed_opcode_count: + pytest.skip("Fixed opcode count is not supported for this test") + attack_gas_limit = gas_benchmark_value gas_costs = fork.gas_costs() From adb7be98e3966f2ed7257b56317a4ee3d2ee3c8d Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Tue, 9 Dec 2025 15:54:20 +0800 Subject: [PATCH 05/11] refactor: update worst scenario for repricing marker --- .../compute/instruction/test_account_query.py | 11 ++++++++++- .../benchmark/compute/instruction/test_arithmetic.py | 4 ++-- tests/benchmark/compute/instruction/test_bitwise.py | 1 + .../compute/instruction/test_call_context.py | 12 +++++++----- .../compute/instruction/test_control_flow.py | 2 ++ tests/benchmark/compute/instruction/test_log.py | 6 +++--- tests/benchmark/compute/instruction/test_memory.py | 6 +++--- tests/benchmark/compute/instruction/test_storage.py | 6 +++++- tests/benchmark/compute/instruction/test_system.py | 8 +++++++- .../benchmark/compute/instruction/test_tx_context.py | 2 +- 10 files changed, 41 insertions(+), 17 deletions(-) diff --git a/tests/benchmark/compute/instruction/test_account_query.py b/tests/benchmark/compute/instruction/test_account_query.py index b045ac4989..2ada0a8f9b 100644 --- a/tests/benchmark/compute/instruction/test_account_query.py +++ b/tests/benchmark/compute/instruction/test_account_query.py @@ -41,7 +41,7 @@ ) -@pytest.mark.repricing(contract_balance=0) +@pytest.mark.repricing(contract_balance=1) @pytest.mark.parametrize("contract_balance", [0, 1]) def test_selfbalance( benchmark_test: BenchmarkTestFiller, @@ -68,6 +68,7 @@ def test_codesize( ) +@pytest.mark.repricing(max_code_size_ratio=0, fixed_src_dst=True) @pytest.mark.parametrize( "max_code_size_ratio", [ @@ -110,6 +111,7 @@ def test_codecopy( ) +@pytest.mark.repricing @pytest.mark.parametrize( "opcode", [ @@ -318,6 +320,7 @@ def test_extcode_ops( ) +@pytest.mark.repricing(copied_size=512) @pytest.mark.parametrize( "copied_size", [ @@ -345,6 +348,11 @@ def test_extcodecopy_warm( ) +@pytest.mark.repricing( + empty_code=True, + initial_balance=True, + initial_storage=True, +) @pytest.mark.parametrize( "opcode", [ @@ -424,6 +432,7 @@ def test_ext_account_query_warm( ) +@pytest.mark.repricing(absent_accounts=True) @pytest.mark.parametrize( "opcode", [ diff --git a/tests/benchmark/compute/instruction/test_arithmetic.py b/tests/benchmark/compute/instruction/test_arithmetic.py index d625907539..d89fd7b9a5 100644 --- a/tests/benchmark/compute/instruction/test_arithmetic.py +++ b/tests/benchmark/compute/instruction/test_arithmetic.py @@ -159,7 +159,7 @@ def test_arithmetic( ) -@pytest.mark.repricing(mod_bits=255) +@pytest.mark.repricing(mod_bits=127) @pytest.mark.parametrize("mod_bits", [255, 191, 127, 63]) @pytest.mark.parametrize("opcode", [Op.MOD, Op.SMOD]) def test_mod( @@ -278,7 +278,7 @@ def test_mod( ) -@pytest.mark.repricing(mod_bits=255) +@pytest.mark.repricing(mod_bits=191) @pytest.mark.parametrize("mod_bits", [255, 191, 127, 63]) @pytest.mark.parametrize("opcode", [Op.ADDMOD, Op.MULMOD]) def test_mod_arithmetic( diff --git a/tests/benchmark/compute/instruction/test_bitwise.py b/tests/benchmark/compute/instruction/test_bitwise.py index 462870ff85..08d47268a0 100644 --- a/tests/benchmark/compute/instruction/test_bitwise.py +++ b/tests/benchmark/compute/instruction/test_bitwise.py @@ -124,6 +124,7 @@ def test_not_op( ) 
+@pytest.mark.repricing @pytest.mark.parametrize("opcode", [Op.SHR, Op.SAR]) def test_shifts( benchmark_test: BenchmarkTestFiller, diff --git a/tests/benchmark/compute/instruction/test_call_context.py b/tests/benchmark/compute/instruction/test_call_context.py index 6ab2e0a29e..5bea574282 100644 --- a/tests/benchmark/compute/instruction/test_call_context.py +++ b/tests/benchmark/compute/instruction/test_call_context.py @@ -48,7 +48,7 @@ def test_call_frame_context_ops( ) -@pytest.mark.repricing(calldata_length=1_000) +@pytest.mark.repricing(calldata_length=10_000) @pytest.mark.parametrize("calldata_length", [0, 1_000, 10_000]) def test_calldatasize( benchmark_test: BenchmarkTestFiller, @@ -64,6 +64,7 @@ def test_calldatasize( ) +@pytest.mark.repricing(non_zero_value=True) @pytest.mark.parametrize("non_zero_value", [True, False]) def test_callvalue_from_origin( benchmark_test: BenchmarkTestFiller, @@ -111,7 +112,7 @@ def test_callvalue_from_call( ) -@pytest.mark.repricing(calldata=b"") +@pytest.mark.repricing(calldata=b"\x00") @pytest.mark.parametrize( "calldata", [ @@ -135,6 +136,7 @@ def test_calldataload( ) +@pytest.mark.repricing(size=0, fixed_src_dst=True, non_zero_data=False) @pytest.mark.parametrize( "size", [ @@ -282,8 +284,8 @@ def test_calldatacopy_from_call( @pytest.mark.repricing( - returned_size=1, - return_data_style=ReturnDataStyle.RETURN, + returned_size=0, + return_data_style=ReturnDataStyle.IDENTITY, ) @pytest.mark.parametrize( "return_data_style", @@ -340,7 +342,7 @@ def test_returndatasize_zero( ) -@pytest.mark.repricing(size=10 * 1024, fixed_dst=True) +@pytest.mark.repricing(size=0, fixed_dst=True) @pytest.mark.parametrize( "size", [ diff --git a/tests/benchmark/compute/instruction/test_control_flow.py b/tests/benchmark/compute/instruction/test_control_flow.py index 8b6de630eb..80ee287fc1 100644 --- a/tests/benchmark/compute/instruction/test_control_flow.py +++ b/tests/benchmark/compute/instruction/test_control_flow.py @@ -35,6 +35,7 @@ def test_gas_op( ) +@pytest.mark.repricing def test_pc_op( benchmark_test: BenchmarkTestFiller, ) -> None: @@ -45,6 +46,7 @@ def test_pc_op( ) +@pytest.mark.repricing def test_jumps( benchmark_test: BenchmarkTestFiller, pre: Alloc, diff --git a/tests/benchmark/compute/instruction/test_log.py b/tests/benchmark/compute/instruction/test_log.py index ca176128c9..62c916d177 100644 --- a/tests/benchmark/compute/instruction/test_log.py +++ b/tests/benchmark/compute/instruction/test_log.py @@ -19,10 +19,10 @@ @pytest.mark.repricing( - size=1024 * 1024, + size=0, non_zero_data=True, - zeros_topic=False, - fixed_offset=True, + zeros_topic=0, + fixed_offset=False, ) @pytest.mark.parametrize( "opcode", diff --git a/tests/benchmark/compute/instruction/test_memory.py b/tests/benchmark/compute/instruction/test_memory.py index 01f9670c1f..31c27ce0b1 100644 --- a/tests/benchmark/compute/instruction/test_memory.py +++ b/tests/benchmark/compute/instruction/test_memory.py @@ -19,7 +19,7 @@ ) -@pytest.mark.repricing(mem_size=1_000) +@pytest.mark.repricing(mem_size=1) @pytest.mark.parametrize("mem_size", [0, 1, 1_000, 100_000, 1_000_000]) def test_msize( benchmark_test: BenchmarkTestFiller, @@ -37,7 +37,7 @@ def test_msize( @pytest.mark.repricing( - offset=31, + offset=0, offset_initialized=True, big_memory_expansion=True, ) @@ -75,7 +75,7 @@ def test_memory_access( ) -@pytest.mark.repricing(size=10 * 1024, fixed_src_dst=True) +@pytest.mark.repricing(size=0, fixed_src_dst=True) @pytest.mark.parametrize( "size", [ diff --git 
a/tests/benchmark/compute/instruction/test_storage.py b/tests/benchmark/compute/instruction/test_storage.py index af6c6e6810..d66f4e268a 100644 --- a/tests/benchmark/compute/instruction/test_storage.py +++ b/tests/benchmark/compute/instruction/test_storage.py @@ -28,7 +28,7 @@ from tests.benchmark.compute.helpers import StorageAction, TransactionResult -@pytest.mark.repricing(fixed_key=False, fixed_value=False) +@pytest.mark.repricing(fixed_key=True, fixed_value=True) @pytest.mark.parametrize("fixed_key", [True, False]) @pytest.mark.parametrize("fixed_value", [True, False]) def test_tload( @@ -87,6 +87,9 @@ def test_tstore( ) +@pytest.mark.repricing( + storage_action=StorageAction.WRITE_SAME_VALUE, absent_slots=False +) @pytest.mark.parametrize( "storage_action,tx_result", [ @@ -285,6 +288,7 @@ def test_storage_access_cold( ) +@pytest.mark.repricing(storage_action=StorageAction.WRITE_SAME_VALUE) @pytest.mark.parametrize( "storage_action", [ diff --git a/tests/benchmark/compute/instruction/test_system.py b/tests/benchmark/compute/instruction/test_system.py index 71d24089fb..4f58d5a005 100644 --- a/tests/benchmark/compute/instruction/test_system.py +++ b/tests/benchmark/compute/instruction/test_system.py @@ -40,6 +40,7 @@ from tests.benchmark.compute.helpers import XOR_TABLE +@pytest.mark.repricing @pytest.mark.parametrize( "opcode", [ @@ -247,6 +248,7 @@ def test_xcall( ) +@pytest.mark.repricing(max_code_size_ratio=0) @pytest.mark.parametrize( "opcode", [ @@ -358,6 +360,7 @@ def test_create( ) +@pytest.mark.repricing @pytest.mark.parametrize( "opcode", [ @@ -430,7 +433,7 @@ def test_creates_collisions( ) -@pytest.mark.repricing(return_size=1024, return_non_zero_data=True) +@pytest.mark.repricing(return_size=0, return_non_zero_data=False) @pytest.mark.parametrize( "opcode", [Op.RETURN, Op.REVERT], @@ -476,6 +479,7 @@ def test_return_revert( ) +@pytest.mark.repricing(value_bearing=True) @pytest.mark.parametrize("value_bearing", [True, False]) def test_selfdestruct_existing( benchmark_test: BenchmarkTestFiller, @@ -635,6 +639,7 @@ def test_selfdestruct_existing( ) +@pytest.mark.repricing(value_bearing=True) @pytest.mark.parametrize("value_bearing", [True, False]) def test_selfdestruct_created( state_test: StateTestFiller, @@ -743,6 +748,7 @@ def test_selfdestruct_created( ) +@pytest.mark.repricing(value_bearing=False) @pytest.mark.parametrize("value_bearing", [True, False]) def test_selfdestruct_initcode( state_test: StateTestFiller, diff --git a/tests/benchmark/compute/instruction/test_tx_context.py b/tests/benchmark/compute/instruction/test_tx_context.py index e6eb35b330..cded6cab60 100644 --- a/tests/benchmark/compute/instruction/test_tx_context.py +++ b/tests/benchmark/compute/instruction/test_tx_context.py @@ -41,7 +41,7 @@ def test_call_frame_context_ops( ) -@pytest.mark.repricing(blob_index=0, blobs_present=0) +@pytest.mark.repricing(blob_index=0, blobs_present=1) @pytest.mark.parametrize( "blob_index, blobs_present", [ From 540080b4b66007c9bd21bacd29e1dc0bc5fb69b6 Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Tue, 9 Dec 2025 16:28:48 +0800 Subject: [PATCH 06/11] chore: remove slow fixed opcount label --- tests/benchmark/compute/instruction/test_system.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/benchmark/compute/instruction/test_system.py b/tests/benchmark/compute/instruction/test_system.py index 4f58d5a005..5c553df60f 100644 --- a/tests/benchmark/compute/instruction/test_system.py +++ b/tests/benchmark/compute/instruction/test_system.py @@ -360,7 +360,6 @@ def 
test_create( ) -@pytest.mark.repricing @pytest.mark.parametrize( "opcode", [ @@ -433,7 +432,6 @@ def test_creates_collisions( ) -@pytest.mark.repricing(return_size=0, return_non_zero_data=False) @pytest.mark.parametrize( "opcode", [Op.RETURN, Op.REVERT], @@ -479,7 +477,6 @@ def test_return_revert( ) -@pytest.mark.repricing(value_bearing=True) @pytest.mark.parametrize("value_bearing", [True, False]) def test_selfdestruct_existing( benchmark_test: BenchmarkTestFiller, From 81ee8d712d2b64b90c848395fac23742db3ff722 Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Wed, 10 Dec 2025 23:18:24 +0800 Subject: [PATCH 07/11] refactor: remove non benchmark test wrapper fixed opcode count feature --- .../plugins/shared/benchmarking.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py index 2c33937bf1..cb0bf3b8a2 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py @@ -118,6 +118,22 @@ def pytest_collection_modifyitems( if not gas_benchmark_value and fixed_opcode_count is None: return + # In --fixed-opcode-count mode, we only support tests that meet all of the following: + # - The test uses the benchmark_test fixture + # - The benchmark test uses a code generator + # + # Here we filter out tests that do not use the benchmark_test fixture. + # Note: At this stage we cannot filter based on whether a code generator is used. + if fixed_opcode_count: + filtered = [] + for item in items: + if ( + hasattr(item, "fixturenames") + and "benchmark_test" in item.fixturenames + ): + filtered.append(item) + items[:] = filtered + # Load config data if --fixed-opcode-count flag provided without value if fixed_opcode_count == "": config_data = load_opcode_counts_config(config) From d0edf35a839307d3bd0801788a3670d89288e2f9 Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Thu, 11 Dec 2025 15:19:02 +0800 Subject: [PATCH 08/11] refactor: update filter logic with detailed comment --- .../cli/pytest_commands/plugins/shared/benchmarking.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py index cb0bf3b8a2..7af315c86b 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py @@ -149,22 +149,28 @@ def pytest_collection_modifyitems( stacklevel=1, ) - # Check if -m repricing marker filter was specified + # Extract the specified flag from the command line. + # If the `-m repricing` flag is not specified, or is negated, + # we skip filtering tests by the repricing marker. markexpr = config.getoption("markexpr", "") if "repricing" not in markexpr or "not repricing" in markexpr: return filtered = [] for item in items: + # If the test does not have the repricing marker, skip it repricing_marker = item.get_closest_marker("repricing") if not repricing_marker: - filtered.append(item) continue + # If the test has the repricing marker but no specific kwargs, + # include the entire parametrized test in the filtered list. 
if not repricing_marker.kwargs: filtered.append(item) continue + # If the test has the repricing marker with specific kwargs, + # filter the test cases according to those kwargs. if hasattr(item, "callspec"): if all( item.callspec.params.get(key) == value From 7423004b9a34460a6c8fa21e3c9085495973d555 Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Thu, 11 Dec 2025 16:06:13 +0800 Subject: [PATCH 09/11] tests: fixed opcode count filtered logic --- .../plugins/filler/tests/test_benchmarking.py | 324 ++++++++++++++++-- 1 file changed, 289 insertions(+), 35 deletions(-) diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py index 82077315fd..30363d66ff 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py @@ -9,44 +9,99 @@ test_module_dummy = textwrap.dedent( """\ import pytest + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op - from execution_testing import Environment - - @pytest.mark.valid_at("Istanbul") - def test_dummy_benchmark_test(state_test, gas_benchmark_value) -> None: - state_test( - env=env,pre={},post={},tx=None) + @pytest.mark.valid_at("Prague") + def test_dummy_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) """ ) test_module_without_fixture = textwrap.dedent( """\ import pytest + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op - from execution_testing import Environment - - @pytest.mark.valid_at("Istanbul") - def test_dummy_no_benchmark_test(state_test) -> None: - state_test(env=env, pre={}, post={}, tx=None) + @pytest.mark.valid_at("Prague") + def test_dummy_no_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) """ ) test_module_with_repricing = textwrap.dedent( """\ import pytest - - from execution_testing import Environment + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") - @pytest.mark.benchmark @pytest.mark.repricing - def test_benchmark_with_repricing(state_test, gas_benchmark_value) -> None: - state_test(env=env, pre={}, post={}, tx=None) + def test_benchmark_with_repricing(benchmark_test: BenchmarkTestFiller) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) + + @pytest.mark.valid_at("Prague") + def test_benchmark_without_repricing(benchmark_test: BenchmarkTestFiller) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) + """ +) + +test_module_without_benchmark_test_fixture = textwrap.dedent( + """\ + import pytest + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op + + @pytest.mark.valid_at("Prague") + def test_with_gas_benchmark_value(state_test, gas_benchmark_value: int) -> None: + # This test intentionally uses state_test instead of benchmark_test + # to verify that --fixed-opcode-count filters it out + state_test(pre={}, post={}, tx=None) + + @pytest.mark.valid_at("Prague") + def test_with_benchmark_test(benchmark_test: 
BenchmarkTestFiller) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) + """ +) + +test_module_with_repricing_kwargs = textwrap.dedent( + """\ + import pytest + from execution_testing import BenchmarkTestFiller, ExtCallGenerator, Op + + @pytest.mark.valid_at("Prague") + @pytest.mark.repricing(opcode=Op.ADD) + @pytest.mark.parametrize("opcode", [Op.ADD, Op.SUB, Op.MUL]) + def test_parametrized_with_repricing_kwargs( + benchmark_test: BenchmarkTestFiller, opcode + ) -> None: + benchmark_test( + target_opcode=opcode, + code_generator=ExtCallGenerator(attack_block=opcode), + ) @pytest.mark.valid_at("Prague") - @pytest.mark.benchmark - def test_benchmark_without_repricing(state_test, gas_benchmark_value) -> None: - state_test(env=env, pre={}, post={}, tx=None) + @pytest.mark.repricing + @pytest.mark.parametrize("opcode", [Op.ADD, Op.SUB]) + def test_parametrized_with_repricing_no_kwargs( + benchmark_test: BenchmarkTestFiller, opcode + ) -> None: + benchmark_test( + target_opcode=opcode, + code_generator=ExtCallGenerator(attack_block=opcode), + ) """ ) @@ -66,9 +121,9 @@ def setup_test_directory_structure( """ tests_dir = pytester.mkdir("tests") - istanbul_tests_dir = tests_dir / "istanbul" - istanbul_tests_dir.mkdir() - dummy_dir = istanbul_tests_dir / "dummy_test_module" + benchmark_tests_dir = tests_dir / "benchmark" + benchmark_tests_dir.mkdir() + dummy_dir = benchmark_tests_dir / "dummy_test_module" dummy_dir.mkdir() test_module = dummy_dir / test_filename test_module.write_text(test_content) @@ -86,7 +141,7 @@ def test_gas_benchmark_option_added(pytester: pytest.Pytester) -> None: name="src/execution_testing/cli/pytest_commands/pytest_ini_files/pytest-fill.ini" ) - # Command: pytest -p pytest_plugins.filler.benchmarking --help + # Equivalent to: fill --help result = pytester.runpytest("-c", "pytest-fill.ini", "--help") assert result.ret == 0 @@ -101,8 +156,7 @@ def test_benchmarking_mode_configured_with_option( pytester: pytest.Pytester, ) -> None: """ - Test that fill_mode is set to BENCHMARKING when --gas-benchmark-values is - used. + Test that op_mode is set to BENCHMARKING when --gas-benchmark-values used. """ setup_test_directory_structure( pytester, test_module_dummy, "test_dummy_benchmark.py" @@ -113,16 +167,16 @@ def test_benchmarking_mode_configured_with_option( "-c", "pytest-fill.ini", "--fork", - "Istanbul", + "Prague", "--gas-benchmark-values", "10,20,30", - "tests/istanbul/dummy_test_module/", + "tests/benchmark/dummy_test_module/", "--collect-only", "-q", ) assert result.ret == 0 - assert any("9 tests collected" in line for line in result.outlines) + assert any("6 tests collected" in line for line in result.outlines) # Check that the test names include the benchmark gas values assert any("benchmark-gas-value_10M" in line for line in result.outlines) assert any("benchmark-gas-value_20M" in line for line in result.outlines) @@ -133,8 +187,8 @@ def test_benchmarking_mode_not_configured_without_option( pytester: pytest.Pytester, ) -> None: """ - Test that fill_mode is not set to BENCHMARKING when --gas-benchmark-values - is not used. + Test that op_mode is not set to BENCHMARKING when --gas-benchmark-values + not used. 
""" setup_test_directory_structure( pytester, test_module_dummy, "test_dummy_benchmark.py" @@ -145,15 +199,15 @@ def test_benchmarking_mode_not_configured_without_option( "-c", "pytest-fill.ini", "--fork", - "Istanbul", - "tests/istanbul/dummy_test_module/", + "Prague", + "tests/benchmark/dummy_test_module/", "--collect-only", "-q", ) assert result.ret == 0 - # Should generate normal test variants (3) without parametrization - assert any("3 tests collected" in line for line in result.outlines) + # Should generate normal test variants (2) without parametrization + assert any("2 tests collected" in line for line in result.outlines) assert not any( "benchmark-gas-value_10M" in line for line in result.outlines ) @@ -206,7 +260,7 @@ def test_repricing_marker_filter_with_benchmark_options( *benchmark_args, "-m", "repricing", - "tests/istanbul/dummy_test_module/", + "tests/benchmark/dummy_test_module/", "--collect-only", "-q", ) @@ -220,3 +274,203 @@ def test_repricing_marker_filter_with_benchmark_options( assert not any( "test_benchmark_without_repricing" in line for line in result.outlines ) + + +def test_fixed_opcode_count_filters_tests_without_benchmark_test_fixture( + pytester: pytest.Pytester, +) -> None: + """ + Test that --fixed-opcode-count filters out tests that don't use the + benchmark_test fixture. + + Only tests with the benchmark_test fixture should be collected when + --fixed-opcode-count is provided. + """ + setup_test_directory_structure( + pytester, + test_module_without_benchmark_test_fixture, + "test_fixture_filter.py", + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--fixed-opcode-count", + "1", + "tests/benchmark/dummy_test_module/", + "--collect-only", + "-q", + ) + + assert result.ret == 0 + # Test with benchmark_test fixture should be collected + assert any("test_with_benchmark_test" in line for line in result.outlines) + # Test with only gas_benchmark_value fixture should NOT be collected + assert not any( + "test_with_gas_benchmark_value" in line for line in result.outlines + ) + + +def test_repricing_marker_with_kwargs_filters_parametrized_tests( + pytester: pytest.Pytester, +) -> None: + """ + Test that repricing marker with kwargs filters parametrized tests to only + include matching parameter combinations. + + When @pytest.mark.repricing(opcode=Op.ADD) is used, only test variants + where opcode=Op.ADD should be collected. 
+ """ + setup_test_directory_structure( + pytester, + test_module_with_repricing_kwargs, + "test_repricing_kwargs.py", + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--fixed-opcode-count", + "1", + "-m", + "repricing", + "tests/benchmark/dummy_test_module/", + "--collect-only", + "-q", + ) + + assert result.ret == 0 + # For test with repricing(opcode=Op.ADD), only ADD variant should be collected + collected_lines = [ + line for line in result.outlines if "test_parametrized" in line + ] + + # test_parametrized_with_repricing_kwargs should only have ADD variants + # (multiple test types like blockchain_test and blockchain_test_engine) + kwargs_test_lines = [ + line + for line in collected_lines + if "test_parametrized_with_repricing_kwargs" in line + ] + # All collected variants should be ADD only (no SUB or MUL) + assert all("ADD" in line for line in kwargs_test_lines) + assert not any("SUB" in line for line in kwargs_test_lines) + assert not any("MUL" in line for line in kwargs_test_lines) + + # test_parametrized_with_repricing_no_kwargs should have all variants (ADD and SUB) + no_kwargs_test_lines = [ + line + for line in collected_lines + if "test_parametrized_with_repricing_no_kwargs" in line + ] + # Should have both ADD and SUB variants + assert any("ADD" in line for line in no_kwargs_test_lines) + assert any("SUB" in line for line in no_kwargs_test_lines) + + +def test_not_repricing_marker_negation( + pytester: pytest.Pytester, +) -> None: + """ + Test that -m 'not repricing' does not apply the repricing filter. + + When -m 'not repricing' is specified, the custom repricing filter should + be skipped and pytest's built-in marker filtering should be used. + """ + setup_test_directory_structure( + pytester, test_module_with_repricing, "test_repricing_negation.py" + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--fixed-opcode-count", + "1", + "-m", + "not repricing", + "tests/benchmark/dummy_test_module/", + "--collect-only", + "-q", + ) + + assert result.ret == 0 + # The repricing test should NOT be collected (negated) + assert not any( + "test_benchmark_with_repricing" in line for line in result.outlines + ) + # The non-repricing test should be collected + assert any( + "test_benchmark_without_repricing" in line for line in result.outlines + ) + + +def test_mutual_exclusivity_of_benchmark_options( + pytester: pytest.Pytester, +) -> None: + """ + Test that --gas-benchmark-values and --fixed-opcode-count cannot be used + together. + """ + setup_test_directory_structure( + pytester, test_module_with_repricing, "test_mutual_exclusivity.py" + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--gas-benchmark-values", + "10", + "--fixed-opcode-count", + "1", + "tests/benchmark/dummy_test_module/", + "--collect-only", + "-q", + ) + + # Should fail with usage error + assert result.ret != 0 + assert any( + "mutually exclusive" in line + for line in result.outlines + result.errlines + ) + + +def test_without_repricing_flag_collects_all_tests( + pytester: pytest.Pytester, +) -> None: + """ + Test that without -m repricing flag, both repricing and non-repricing + tests are collected. 
+ """ + setup_test_directory_structure( + pytester, test_module_with_repricing, "test_no_filter.py" + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--fixed-opcode-count", + "1", + "tests/benchmark/dummy_test_module/", + "--collect-only", + "-q", + ) + + assert result.ret == 0 + # Both tests should be collected + assert any( + "test_benchmark_with_repricing" in line for line in result.outlines + ) + assert any( + "test_benchmark_without_repricing" in line for line in result.outlines + ) From 635cfe3e202bb35f723d930e5e36492a1a7812bc Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Thu, 11 Dec 2025 17:46:18 +0800 Subject: [PATCH 10/11] feat: implement benchmark coverage script --- packages/testing/pyproject.toml | 1 + .../cli/benchmark_coverage.py | 501 ++++++++++++++++++ 2 files changed, 502 insertions(+) create mode 100644 packages/testing/src/execution_testing/cli/benchmark_coverage.py diff --git a/packages/testing/pyproject.toml b/packages/testing/pyproject.toml index 669163764e..423ba6038e 100644 --- a/packages/testing/pyproject.toml +++ b/packages/testing/pyproject.toml @@ -102,6 +102,7 @@ compare_fixtures = "execution_testing.cli.compare_fixtures:main" modify_static_test_gas_limits = "execution_testing.cli.modify_static_test_gas_limits:main" validate_changelog = "execution_testing.cli.tox_helpers:validate_changelog" benchmark_parser = "execution_testing.cli.benchmark_parser:main" +benchmark_coverage = "execution_testing.cli.benchmark_coverage:main" [tool.setuptools.packages.find] where = ["src"] diff --git a/packages/testing/src/execution_testing/cli/benchmark_coverage.py b/packages/testing/src/execution_testing/cli/benchmark_coverage.py new file mode 100644 index 0000000000..0aa3e62fff --- /dev/null +++ b/packages/testing/src/execution_testing/cli/benchmark_coverage.py @@ -0,0 +1,501 @@ +""" +Analyze benchmark test coverage for opcodes across different benchmark modes. 
+ +This script scans benchmark tests to determine which opcodes are covered by: +- worst-case-benchmark mode (--gas-benchmark-values): tests using benchmark_test +- fixed-opcode-count mode (--fixed-opcode-count): tests using benchmark_test + code_generator + +Usage: + uv run benchmark_coverage # Generate markdown coverage report + uv run benchmark_coverage --json # Output as JSON +""" + +import argparse +import ast +import json +import sys +from collections import defaultdict +from pathlib import Path + +from ethereum.forks.osaka.vm.instructions import Ops + + +def get_repo_root() -> Path: + """Get the repository root directory.""" + current = Path.cwd() + while current != current.parent: + if (current / "tests" / "benchmark").exists(): + return current + current = current.parent + raise FileNotFoundError("Could not find repository root") + + +def get_benchmark_dir() -> Path: + """Get the benchmark tests directory.""" + return get_repo_root() / "tests" / "benchmark" + + +def get_opcode_values() -> dict[str, int]: + """Build opcode name -> value mapping from Ops enum.""" + return {op.name: op.value for op in Ops} + + +# Build OPCODE_VALUES from the Ops enum +OPCODE_VALUES: dict[str, int] = get_opcode_values() + +# Opcode aliases: map old/alternate names to canonical names in Ops enum +# When tests use an alias, coverage is attributed to the canonical opcode +OPCODE_ALIASES: dict[str, str] = { + "SHA3": "KECCAK", + "KECCAK256": "KECCAK", + "DIFFICULTY": "PREVRANDAO", +} + + +def normalize_opcode(opcode: str) -> str: + """Normalize opcode name using aliases.""" + return OPCODE_ALIASES.get(opcode, opcode) + + +def opcode_sort_key(opcode: str) -> tuple[int, str]: + """Return sort key for an opcode (by opcode value, then alphabetically).""" + if opcode in OPCODE_VALUES: + return (OPCODE_VALUES[opcode], opcode) + return (0x1000, opcode) # Unknown opcodes go at the end + + +class BenchmarkCoverageExtractor(ast.NodeVisitor): + """Extract benchmark test coverage information from test functions.""" + + def __init__(self, source_code: str, file_path: Path): + self.source_code = source_code + self.file_path = file_path + # Maps opcode -> list of (test_name, supports_fixed_opcode_count) + self.coverage: dict[str, list[tuple[str, bool]]] = defaultdict(list) + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + """Visit function definitions and extract opcode coverage.""" + if not node.name.startswith("test_"): + return + + # Check if function has benchmark_test parameter + if not self._has_benchmark_test_param(node): + return + + # Check if function uses code_generator + uses_code_generator = self._uses_code_generator(node) + + # Check if target_opcode= is explicitly used in benchmark_test() call + has_target_opcode = self._has_target_opcode(node) + + # Extract opcodes for coverage tracking + # For fixed-opcode-count: ONLY if target_opcode= is present AND code_generator= + # For worst-case-benchmark: from target_opcode= OR parametrized opcode + opcodes_from_target = self._extract_opcodes_from_target_opcode(node) + opcodes_from_parametrize = self._extract_parametrized_opcodes(node) + + test_name = node.name + + if has_target_opcode: + # Test explicitly declares target_opcode - eligible for fixed-opcode-count + opcodes = ( + opcodes_from_target + if opcodes_from_target + else opcodes_from_parametrize + ) + supports_fixed = uses_code_generator + for opcode in opcodes: + self.coverage[opcode].append((test_name, supports_fixed)) + else: + # No target_opcode= - worst-case-benchmark only (from 
parametrize) + for opcode in opcodes_from_parametrize: + self.coverage[opcode].append((test_name, False)) + + def _has_benchmark_test_param(self, node: ast.FunctionDef) -> bool: + """Check if function has benchmark_test parameter.""" + return any(arg.arg == "benchmark_test" for arg in node.args.args) + + def _uses_code_generator(self, node: ast.FunctionDef) -> bool: + """Check if function body uses code_generator parameter.""" + func_start = node.lineno - 1 + func_end = node.end_lineno + if func_end is None: + return False + func_source = "\n".join( + self.source_code.splitlines()[func_start:func_end] + ) + return "code_generator=" in func_source + + def _has_target_opcode(self, node: ast.FunctionDef) -> bool: + """Check if target_opcode= is used in benchmark_test() call.""" + func_start = node.lineno - 1 + func_end = node.end_lineno + if func_end is None: + return False + func_source = "\n".join( + self.source_code.splitlines()[func_start:func_end] + ) + return "target_opcode=" in func_source + + def _extract_opcodes_from_target_opcode( + self, node: ast.FunctionDef + ) -> list[str]: + """ + Extract opcodes from target_opcode= in benchmark_test() call. + + Handles: + 1. target_opcode=Op.XXX (direct opcode) + 2. target_opcode=opcode (parametrized variable) + """ + # First try direct target_opcode=Op.XXX + direct_opcodes = self._extract_direct_target_opcodes(node) + if direct_opcodes: + return direct_opcodes + + # Check if target_opcode= references a parametrized variable + if self._has_target_opcode(node): + return self._extract_parametrized_opcodes(node) + + return [] + + def _extract_direct_target_opcodes( + self, node: ast.FunctionDef + ) -> list[str]: + """Extract direct target_opcode=Op.XXX from benchmark_test() calls.""" + opcodes: list[str] = [] + + for child in ast.walk(node): + if not isinstance(child, ast.Call): + continue + + # Look for benchmark_test(...) 
calls + if ( + isinstance(child.func, ast.Name) + and child.func.id == "benchmark_test" + ): + for keyword in child.keywords: + if keyword.arg == "target_opcode": + opcode = self._extract_opcode_from_expr(keyword.value) + if opcode: + opcodes.append(opcode) + + return opcodes + + def _extract_opcode_from_expr(self, expr: ast.expr) -> str | None: + """Extract opcode name from an expression like Op.ADD.""" + if isinstance(expr, ast.Attribute): + # Op.ADD -> "ADD" + return expr.attr + return None + + def _extract_parametrized_opcodes( + self, node: ast.FunctionDef + ) -> list[str]: + """Extract opcodes from @pytest.mark.parametrize decorators.""" + opcodes: list[str] = [] + + for decorator in node.decorator_list: + if not self._is_parametrize_decorator(decorator): + continue + + if not isinstance(decorator, ast.Call) or len(decorator.args) < 2: + continue + + # Get parameter names (first arg) + param_names = decorator.args[0] + if isinstance(param_names, ast.Constant): + param_str = str(param_names.value).lower() + else: + continue + + # Check if "opcode" is in parameter names + if "opcode" not in param_str: + continue + + # Extract opcode values from second arg (the list) + param_values = decorator.args[1] + opcodes.extend(self._parse_opcode_values(param_values)) + + return opcodes + + def _is_parametrize_decorator(self, decorator: ast.expr) -> bool: + """Check if decorator is @pytest.mark.parametrize.""" + if isinstance(decorator, ast.Call): + if isinstance(decorator.func, ast.Attribute): + if ( + isinstance(decorator.func.value, ast.Attribute) + and decorator.func.value.attr == "mark" + and decorator.func.attr == "parametrize" + ): + return True + return False + + def _parse_opcode_values(self, values_node: ast.expr) -> list[str]: + """Parse opcode values from the parametrize list.""" + opcodes: list[str] = [] + + if not isinstance(values_node, (ast.List, ast.Tuple)): + return opcodes + + for element in values_node.elts: + opcode_name = self._extract_opcode_name(element) + if opcode_name: + opcodes.append(opcode_name) + + return opcodes + + def _extract_opcode_name(self, node: ast.expr) -> str | None: + """ + Extract opcode name from various AST node types. + + Handles: + - Op.ADD (direct) + - pytest.param(Op.ADD, ...) + - pytest.param((Op.ADD, x), ...) + - (Op.ADD, x) tuple + """ + # Direct opcode - Op.ADD + if isinstance(node, ast.Attribute): + return node.attr + + # pytest.param(Op.ADD, ...) or pytest.param((Op.ADD, x), ...) + if isinstance(node, ast.Call): + if len(node.args) > 0: + first_arg = node.args[0] + if isinstance(first_arg, ast.Attribute): + return first_arg.attr + elif isinstance(first_arg, ast.Tuple) and first_arg.elts: + first_elem = first_arg.elts[0] + if isinstance(first_elem, ast.Attribute): + return first_elem.attr + + # Plain tuple - (Op.ADD, args) + if isinstance(node, ast.Tuple) and node.elts: + first_elem = node.elts[0] + if isinstance(first_elem, ast.Attribute): + return first_elem.attr + + return None + + +def scan_benchmark_tests( + base_path: Path, +) -> dict[str, dict[str, list[str]]]: + """ + Scan benchmark test files and extract opcode coverage. + + Returns: + Dict mapping opcode -> { + "worst_case_benchmark": [test_names...], + "fixed_opcode_count": [test_names...] 
+ } + """ + # opcode -> {"worst_case_benchmark": [...], "fixed_opcode_count": [...]} + coverage: dict[str, dict[str, list[str]]] = defaultdict( + lambda: {"worst_case_benchmark": [], "fixed_opcode_count": []} + ) + + test_files = [ + f + for f in base_path.rglob("test_*.py") + if "configs" not in str(f) and "stateful" not in str(f) + ] + + for test_file in test_files: + try: + source = test_file.read_text() + tree = ast.parse(source) + + extractor = BenchmarkCoverageExtractor(source, test_file) + extractor.visit(tree) + + for opcode, tests in extractor.coverage.items(): + # Normalize opcode name (handle aliases like SHA3 -> KECCAK256) + canonical_opcode = normalize_opcode(opcode) + for test_name, uses_code_generator in tests: + # All benchmark tests support worst-case-benchmark mode + if ( + test_name + not in coverage[canonical_opcode][ + "worst_case_benchmark" + ] + ): + coverage[canonical_opcode][ + "worst_case_benchmark" + ].append(test_name) + + # Only tests with code_generator support fixed-opcode-count mode + if uses_code_generator: + if ( + test_name + not in coverage[canonical_opcode][ + "fixed_opcode_count" + ] + ): + coverage[canonical_opcode][ + "fixed_opcode_count" + ].append(test_name) + + except Exception as e: + print( + f"Warning: Failed to parse {test_file}: {e}", file=sys.stderr + ) + continue + + return dict(coverage) + + +def generate_markdown_table(coverage: dict[str, dict[str, list[str]]]) -> str: + """Generate markdown table from coverage data.""" + lines = [ + "# Benchmark Test Coverage by Opcode", + "", + "| Opcode | Name | worst-case-benchmark | fixed-opcode-count |", + "|--------|------|---------------------|-------------------|", + ] + + # Include all known opcodes, not just those with tests + # Exclude aliases (they're consolidated into canonical names) + all_opcodes = (set(OPCODE_VALUES.keys()) | set(coverage.keys())) - set( + OPCODE_ALIASES.keys() + ) + sorted_opcodes = sorted(all_opcodes, key=opcode_sort_key) + + for opcode in sorted_opcodes: + data = coverage.get( + opcode, {"worst_case_benchmark": [], "fixed_opcode_count": []} + ) + worst_case = ", ".join(sorted(data["worst_case_benchmark"])) or "-" + fixed_opcode = ", ".join(sorted(data["fixed_opcode_count"])) or "-" + # Get hex value, default to "?" for unknown opcodes + hex_value = ( + f"0x{OPCODE_VALUES[opcode]:02X}" + if opcode in OPCODE_VALUES + else "?" 
+ ) + lines.append( + f"| {hex_value} | {opcode} | {worst_case} | {fixed_opcode} |" + ) + + return "\n".join(lines) + "\n" + + +def generate_summary(coverage: dict[str, dict[str, list[str]]]) -> str: + """Generate summary statistics.""" + # Exclude aliases from all_opcodes + all_opcodes = (set(OPCODE_VALUES.keys()) | set(coverage.keys())) - set( + OPCODE_ALIASES.keys() + ) + total_known_opcodes = len(all_opcodes) + opcodes_with_tests = len(coverage) + opcodes_with_worst_case = sum( + 1 for data in coverage.values() if data["worst_case_benchmark"] + ) + opcodes_with_fixed = sum( + 1 for data in coverage.values() if data["fixed_opcode_count"] + ) + + lines = [ + "", + "## Summary", + "", + f"- Total known opcodes: {total_known_opcodes}", + f"- Opcodes with benchmark tests: {opcodes_with_tests}", + f"- Opcodes with worst-case-benchmark coverage: {opcodes_with_worst_case}", + f"- Opcodes with fixed-opcode-count coverage: {opcodes_with_fixed}", + ] + + # Opcodes with no coverage at all + no_coverage = [ + opcode + for opcode in all_opcodes + if opcode not in coverage + or not coverage[opcode]["worst_case_benchmark"] + ] + no_coverage.sort(key=opcode_sort_key) + + lines.extend( + [ + "", + "### Opcodes with no benchmark coverage:", + "", + ] + ) + + if no_coverage: + for opcode in no_coverage: + lines.append(f"- {opcode}") + else: + lines.append("- None (all covered)") + + # Opcodes missing fixed-opcode-count coverage + missing_fixed = [ + opcode + for opcode, data in coverage.items() + if data["worst_case_benchmark"] and not data["fixed_opcode_count"] + ] + missing_fixed.sort(key=opcode_sort_key) + + lines.extend( + [ + "", + "### Opcodes missing fixed-opcode-count coverage:", + "", + ] + ) + + if missing_fixed: + for opcode in missing_fixed: + lines.append(f"- {opcode}") + else: + lines.append("- None (all covered)") + + return "\n".join(lines) + "\n" + + +def main() -> int: + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Analyze benchmark test coverage for opcodes" + ) + parser.add_argument( + "--json", + action="store_true", + help="Output as JSON instead of markdown", + ) + parser.add_argument( + "--output", + "-o", + type=str, + help="Output file path (default: stdout)", + ) + args = parser.parse_args() + + try: + benchmark_dir = get_benchmark_dir() + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + print(f"Scanning benchmark tests in {benchmark_dir}...", file=sys.stderr) + coverage = scan_benchmark_tests(benchmark_dir) + print( + f"Found {len(coverage)} opcodes with benchmark tests", file=sys.stderr + ) + + if args.json: + output = json.dumps(coverage, indent=2, sort_keys=True) + "\n" + else: + output = generate_markdown_table(coverage) + generate_summary(coverage) + + if args.output: + Path(args.output).write_text(output) + print(f"Output written to {args.output}", file=sys.stderr) + else: + print(output) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 35d6d67a1f2510fb8f01e006fcf28cb7045856ad Mon Sep 17 00:00:00 2001 From: LouisTsai Date: Thu, 11 Dec 2025 17:47:37 +0800 Subject: [PATCH 11/11] refactor: update the benchmark test target opcode --- .../compute/instruction/test_account_query.py | 6 ++-- .../compute/instruction/test_storage.py | 28 +++++++++++++++---- .../compute/instruction/test_system.py | 6 ++-- .../scenario/test_transaction_types.py | 5 ++-- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/benchmark/compute/instruction/test_account_query.py 
b/tests/benchmark/compute/instruction/test_account_query.py index 2ada0a8f9b..367a4d34a0 100644 --- a/tests/benchmark/compute/instruction/test_account_query.py +++ b/tests/benchmark/compute/instruction/test_account_query.py @@ -22,7 +22,6 @@ Alloc, BenchmarkTestFiller, Block, - BlockchainTestFiller, Bytecode, Environment, ExtCallGenerator, @@ -121,7 +120,7 @@ def test_codecopy( ], ) def test_extcode_ops( - blockchain_test: BlockchainTestFiller, + benchmark_test: BenchmarkTestFiller, pre: Alloc, fork: Fork, opcode: Op, @@ -309,14 +308,13 @@ def test_extcode_ops( sender=pre.fund_eoa(), ) - blockchain_test( + benchmark_test( pre=pre, post=post, blocks=[ Block(txs=[contracts_deployment_tx]), Block(txs=[opcode_tx]), ], - exclude_full_post_state_in_output=True, ) diff --git a/tests/benchmark/compute/instruction/test_storage.py b/tests/benchmark/compute/instruction/test_storage.py index d66f4e268a..d2dd74417b 100644 --- a/tests/benchmark/compute/instruction/test_storage.py +++ b/tests/benchmark/compute/instruction/test_storage.py @@ -52,6 +52,7 @@ def test_tload( tx_data = b"42" if fixed_key and not fixed_value else b"" benchmark_test( + target_opcode=Op.TLOAD, code_generator=ExtCallGenerator( setup=setup, attack_block=attack_block, @@ -81,6 +82,7 @@ def test_tstore( cleanup = Op.POP + Op.GAS if not fixed_key else Bytecode() benchmark_test( + target_opcode=Op.TSTORE, code_generator=JumpLoopGenerator( setup=setup, attack_block=attack_block, cleanup=cleanup ), @@ -91,39 +93,46 @@ def test_tstore( storage_action=StorageAction.WRITE_SAME_VALUE, absent_slots=False ) @pytest.mark.parametrize( - "storage_action,tx_result", + "opcode,storage_action,tx_result", [ pytest.param( + Op.SLOAD, StorageAction.READ, TransactionResult.SUCCESS, id="SSLOAD", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_SAME_VALUE, TransactionResult.SUCCESS, id="SSTORE same value", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_SAME_VALUE, TransactionResult.REVERT, id="SSTORE same value, revert", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_SAME_VALUE, TransactionResult.OUT_OF_GAS, id="SSTORE same value, out of gas", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_NEW_VALUE, TransactionResult.SUCCESS, id="SSTORE new value", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_NEW_VALUE, TransactionResult.REVERT, id="SSTORE new value, revert", ), pytest.param( + Op.SSTORE, StorageAction.WRITE_NEW_VALUE, TransactionResult.OUT_OF_GAS, id="SSTORE new value, out of gas", @@ -141,6 +150,7 @@ def test_storage_access_cold( benchmark_test: BenchmarkTestFiller, pre: Alloc, fork: Fork, + opcode: Op, storage_action: StorageAction, absent_slots: bool, env: Environment, @@ -279,6 +289,7 @@ def test_storage_access_cold( blocks.append(Block(txs=[op_tx])) benchmark_test( + target_opcode=opcode, blocks=blocks, expected_benchmark_gas_used=( total_gas_used @@ -290,16 +301,21 @@ def test_storage_access_cold( @pytest.mark.repricing(storage_action=StorageAction.WRITE_SAME_VALUE) @pytest.mark.parametrize( - "storage_action", + "opcode,storage_action", [ - pytest.param(StorageAction.READ, id="SLOAD"), - pytest.param(StorageAction.WRITE_SAME_VALUE, id="SSTORE same value"), - pytest.param(StorageAction.WRITE_NEW_VALUE, id="SSTORE new value"), + pytest.param(Op.SLOAD, StorageAction.READ, id="SLOAD"), + pytest.param( + Op.SSTORE, StorageAction.WRITE_SAME_VALUE, id="SSTORE same value" + ), + pytest.param( + Op.SSTORE, StorageAction.WRITE_NEW_VALUE, id="SSTORE new value" + ), ], ) def test_storage_access_warm( benchmark_test: BenchmarkTestFiller, pre: 
Alloc, + opcode: Op, storage_action: StorageAction, gas_benchmark_value: int, env: Environment, @@ -357,4 +373,4 @@ def test_storage_access_warm( ) blocks.append(Block(txs=[op_tx])) - benchmark_test(blocks=blocks) + benchmark_test(target_opcode=opcode, blocks=blocks) diff --git a/tests/benchmark/compute/instruction/test_system.py b/tests/benchmark/compute/instruction/test_system.py index 5c553df60f..4a41df8c89 100644 --- a/tests/benchmark/compute/instruction/test_system.py +++ b/tests/benchmark/compute/instruction/test_system.py @@ -21,7 +21,6 @@ Alloc, BenchmarkTestFiller, Block, - BlockchainTestFiller, Bytecode, Environment, ExtCallGenerator, @@ -51,7 +50,7 @@ ], ) def test_xcall( - blockchain_test: BlockchainTestFiller, + benchmark_test: BenchmarkTestFiller, pre: Alloc, fork: Fork, opcode: Op, @@ -237,14 +236,13 @@ def test_xcall( sender=pre.fund_eoa(), ) - blockchain_test( + benchmark_test( pre=pre, post=post, blocks=[ Block(txs=[contracts_deployment_tx]), Block(txs=[opcode_tx]), ], - exclude_full_post_state_in_output=True, ) diff --git a/tests/benchmark/compute/scenario/test_transaction_types.py b/tests/benchmark/compute/scenario/test_transaction_types.py index 8c337a0ef8..ac39843773 100644 --- a/tests/benchmark/compute/scenario/test_transaction_types.py +++ b/tests/benchmark/compute/scenario/test_transaction_types.py @@ -13,7 +13,6 @@ AuthorizationTuple, BenchmarkTestFiller, Block, - BlockchainTestFiller, Fork, Hash, Op, @@ -401,7 +400,7 @@ def test_block_full_access_list_and_data( @pytest.mark.parametrize("empty_authority", [True, False]) @pytest.mark.parametrize("zero_delegation", [True, False]) def test_auth_transaction( - blockchain_test: BlockchainTestFiller, + benchmark_test: BenchmarkTestFiller, pre: Alloc, intrinsic_cost: int, gas_benchmark_value: int, @@ -455,7 +454,7 @@ def test_auth_transaction( * iteration_count, ) - blockchain_test( + benchmark_test( pre=pre, post={}, blocks=[Block(txs=[tx])],