diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml index ee7a816c3d..7fe33f86c2 100644 --- a/.github/workflows/benchmark.yaml +++ b/.github/workflows/benchmark.yaml @@ -32,8 +32,15 @@ jobs: enable-cache: false version: ${{ vars.UV_VERSION }} + - uses: ./.github/actions/build-evm-base + id: evm-builder + with: + type: benchmark + - name: Run benchmark unit tests run: uvx tox -e tests_benchmark_pytest_py3 + env: + EVM_BIN: ${{ steps.evm-builder.outputs.evm-bin }} sanity-checks: name: ${{ matrix.name }} diff --git a/packages/testing/src/execution_testing/cli/benchmark_parser.py b/packages/testing/src/execution_testing/cli/benchmark_parser.py index f05612fea6..8c5fdfa698 100644 --- a/packages/testing/src/execution_testing/cli/benchmark_parser.py +++ b/packages/testing/src/execution_testing/cli/benchmark_parser.py @@ -6,11 +6,12 @@ Usage: uv run benchmark_parser # Update `.fixed_opcode_counts.json` - uv run benchmark_parser --check # Check for new/missing entries (CI) + uv run benchmark_parser --check # Check for new/missing entries """ import argparse import ast +import re import sys from pathlib import Path @@ -19,6 +20,31 @@ ) +def is_related_pattern(pattern: str, detected_patterns: set[str]) -> bool: + """ + Check if a pattern is related to any detected patterns or more specific. + Related patterns are preserved as they're intentional overrides. + """ + # Check if existing pattern is BROADER than detected + try: + compiled = re.compile(pattern) + for detected in detected_patterns: + if compiled.search(detected): + return True + except re.error: + pass + + # Check if existing pattern is MORE SPECIFIC than detected + for detected in detected_patterns: + try: + if re.search(detected, pattern): + return True + except re.error: + continue + + return False + + def get_repo_root() -> Path: """Get the repository root directory.""" current = Path.cwd() @@ -190,20 +216,15 @@ def _extract_opcode_name(self, node: ast.expr) -> str | None: return None -def scan_benchmark_tests( - base_path: Path, -) -> tuple[dict[str, list[int]], dict[str, Path]]: +def scan_benchmark_tests(base_path: Path) -> dict[str, list[float]]: """ Scan benchmark test files and extract opcode patterns. Returns: - Tuple of (config, pattern_sources) where: - - config: mapping of pattern -> opcode counts - - pattern_sources: mapping of pattern -> source file path + Mapping of pattern -> opcode counts (default [1] for new patterns). """ - config: dict[str, list[int]] = {} - pattern_sources: dict[str, Path] = {} - default_counts = [1] + config: dict[str, list[float]] = {} + default_counts: list[float] = [1.0] test_files = [ f @@ -222,12 +243,11 @@ def scan_benchmark_tests( for pattern in extractor.patterns: if pattern not in config: config[pattern] = default_counts - pattern_sources[pattern] = test_file except Exception as e: print(f"Warning: Failed to parse {test_file}: {e}") continue - return config, pattern_sources + return config def load_existing_config(config_file: Path) -> OpcodeCountsConfig: @@ -237,47 +257,12 @@ def load_existing_config(config_file: Path) -> OpcodeCountsConfig: return OpcodeCountsConfig.model_validate_json(config_file.read_bytes()) -def categorize_patterns( - config: dict[str, list[int]], pattern_sources: dict[str, Path] -) -> dict[str, list[str]]: - """ - Categorize patterns by deriving category from source file name. 
- - Example: test_arithmetic.py -> ARITHMETIC - """ - categories: dict[str, list[str]] = {} - - for pattern in config.keys(): - if pattern in pattern_sources: - source_file = pattern_sources[pattern] - file_name = source_file.stem - if file_name.startswith("test_"): - category = file_name[5:].upper() # Remove "test_" prefix - else: - category = "OTHER" - else: - category = "OTHER" - - if category not in categories: - categories[category] = [] - categories[category].append(pattern) - - return {k: sorted(v) for k, v in sorted(categories.items())} - - def generate_config_json( - config: dict[str, list[int]], - pattern_sources: dict[str, Path], - default_counts: list[int], + config: dict[str, list[float]], + default_counts: list[float], ) -> OpcodeCountsConfig: - """Generate the JSON config file content.""" - categories = categorize_patterns(config, pattern_sources) - - scenario_configs: dict[str, list[int]] = {} - for _, patterns in categories.items(): - for pattern in patterns: - scenario_configs[pattern] = config[pattern] - + """Generate the JSON config file content with sorted patterns.""" + scenario_configs = {k: config[k] for k in sorted(config.keys())} return OpcodeCountsConfig( scenario_configs=scenario_configs, default_counts=default_counts, @@ -304,7 +289,7 @@ def main() -> int: return 1 print(f"Scanning benchmark tests in {benchmark_dir}...") - detected, pattern_sources = scan_benchmark_tests(benchmark_dir) + detected = scan_benchmark_tests(benchmark_dir) print(f"Detected {len(detected)} opcode patterns") existing_file = load_existing_config(config_file) @@ -314,11 +299,25 @@ def main() -> int: detected_keys = set(detected.keys()) existing_keys = set(existing.keys()) new_patterns = sorted(detected_keys - existing_keys) - obsolete_patterns = sorted(existing_keys - detected_keys) + # Separate truly obsolete patterns from related patterns that should be kept + potentially_obsolete = existing_keys - detected_keys + related_patterns: set[str] = set() + obsolete_patterns: set[str] = set() + for pattern in potentially_obsolete: + if is_related_pattern(pattern, detected_keys): + related_patterns.add(pattern) + else: + obsolete_patterns.add(pattern) + + # Merge: start with detected, preserve existing counts, keep related patterns merged = detected.copy() for pattern, counts in existing.items(): if pattern in detected_keys: + # Preserve existing counts for detected patterns + merged[pattern] = counts + elif pattern in related_patterns: + # Keep related patterns (broader or more specific) with their existing counts merged[pattern] = counts print("\n" + "=" * 60) @@ -332,14 +331,21 @@ def main() -> int: if len(new_patterns) > 15: print(f" ... and {len(new_patterns) - 15} more") + if related_patterns: + print(f"\n~ Preserving {len(related_patterns)} RELATED patterns:") + for p in sorted(related_patterns)[:15]: + print(f" {p}") + if len(related_patterns) > 15: + print(f" ... and {len(related_patterns) - 15} more") + if obsolete_patterns: print(f"\n- Found {len(obsolete_patterns)} OBSOLETE patterns:") - for p in obsolete_patterns[:15]: + for p in sorted(obsolete_patterns)[:15]: print(f" {p}") if len(obsolete_patterns) > 15: print(f" ... 
and {len(obsolete_patterns) - 15} more") - if not new_patterns and not obsolete_patterns: + if not new_patterns and not obsolete_patterns and not related_patterns: print("\nConfiguration is up to date!") print("=" * 60) @@ -350,14 +356,7 @@ def main() -> int: return 1 return 0 - for pattern in obsolete_patterns: - print(f"Removing obsolete: {pattern}") - if pattern in merged: - del merged[pattern] - - content = generate_config_json( - merged, pattern_sources, existing_file.default_counts - ) + content = generate_config_json(merged, existing_file.default_counts) config_file.write_text( content.model_dump_json(exclude_defaults=True, indent=2) ) diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py index 30363d66ff..4be43f5994 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/filler/tests/test_benchmarking.py @@ -1,17 +1,27 @@ """Test the benchmarking pytest plugin for gas benchmark values.""" +import json import textwrap from pathlib import Path from typing import List +from unittest.mock import MagicMock import pytest +from execution_testing.cli.pytest_commands.plugins.shared.benchmarking import ( + OpcodeCountsConfig, +) + +# EVM binary for tests that actually fill (not just collect) +BENCHMARK_EVM_T8N = "evmone-t8n" + test_module_dummy = textwrap.dedent( """\ import pytest from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark def test_dummy_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: benchmark_test( target_opcode=Op.JUMPDEST, @@ -26,6 +36,7 @@ def test_dummy_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark def test_dummy_no_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: benchmark_test( target_opcode=Op.JUMPDEST, @@ -40,6 +51,7 @@ def test_dummy_no_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark @pytest.mark.repricing def test_benchmark_with_repricing(benchmark_test: BenchmarkTestFiller) -> None: benchmark_test( @@ -48,6 +60,7 @@ def test_benchmark_with_repricing(benchmark_test: BenchmarkTestFiller) -> None: ) @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark def test_benchmark_without_repricing(benchmark_test: BenchmarkTestFiller) -> None: benchmark_test( target_opcode=Op.JUMPDEST, @@ -62,12 +75,14 @@ def test_benchmark_without_repricing(benchmark_test: BenchmarkTestFiller) -> Non from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark def test_with_gas_benchmark_value(state_test, gas_benchmark_value: int) -> None: # This test intentionally uses state_test instead of benchmark_test # to verify that --fixed-opcode-count filters it out state_test(pre={}, post={}, tx=None) @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark def test_with_benchmark_test(benchmark_test: BenchmarkTestFiller) -> None: benchmark_test( target_opcode=Op.JUMPDEST, @@ -79,28 +94,32 @@ def test_with_benchmark_test(benchmark_test: BenchmarkTestFiller) -> 
None: test_module_with_repricing_kwargs = textwrap.dedent( """\ import pytest - from execution_testing import BenchmarkTestFiller, ExtCallGenerator, Op + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark @pytest.mark.repricing(opcode=Op.ADD) @pytest.mark.parametrize("opcode", [Op.ADD, Op.SUB, Op.MUL]) def test_parametrized_with_repricing_kwargs( benchmark_test: BenchmarkTestFiller, opcode ) -> None: + # Use JUMPDEST for actual benchmarking; opcode param is just for filtering benchmark_test( - target_opcode=opcode, - code_generator=ExtCallGenerator(attack_block=opcode), + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), ) @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark @pytest.mark.repricing @pytest.mark.parametrize("opcode", [Op.ADD, Op.SUB]) def test_parametrized_with_repricing_no_kwargs( benchmark_test: BenchmarkTestFiller, opcode ) -> None: + # Use JUMPDEST for actual benchmarking; opcode param is just for filtering benchmark_test( - target_opcode=opcode, - code_generator=ExtCallGenerator(attack_block=opcode), + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), ) """ ) @@ -474,3 +493,462 @@ def test_without_repricing_flag_collects_all_tests( assert any( "test_benchmark_without_repricing" in line for line in result.outlines ) + + +def test_fixed_opcode_count_exact_match_priority() -> None: + """ + Exact match takes priority over regex patterns. + + When using a config file, patterns are matched against test names. An exact + string match should take priority over a regex pattern that also matches. + """ + config = OpcodeCountsConfig( + scenario_configs={ + "test_dup": [10], + "test_dup.*": [1], + }, + default_counts=[99], + ) + + params = config.get_test_parameters("test_dup") + assert params[0].values[0] == 10 + + +def test_fixed_opcode_count_longest_pattern_wins() -> None: + """ + Longest matching pattern takes priority. + + When using a config file, if multiple regex patterns match a test name, the + longest pattern should win. This allows more specific patterns to override + broader ones. + """ + config = OpcodeCountsConfig( + scenario_configs={ + "test_dup.*": [1], + "test_dup.*DUP1.*": [5], + }, + default_counts=[99], + ) + + # Longer pattern should win for DUP1 + params = config.get_test_parameters( + "test_dup[fork_Prague-opcount_1K-opcode_DUP1]" + ) + assert params[0].values[0] == 5 + + # Shorter pattern should match for DUP2 + params = config.get_test_parameters( + "test_dup[fork_Prague-opcount_1K-opcode_DUP2]" + ) + assert params[0].values[0] == 1 + + +def test_fixed_opcode_count_default_fallback() -> None: + """ + Default counts are used when no pattern matches. + + When using a config file, if no pattern matches the test name, the + default_counts should be used as a fallback. + """ + config = OpcodeCountsConfig( + scenario_configs={ + "test_dup.*": [1], + }, + default_counts=[99], + ) + + params = config.get_test_parameters("test_other") + assert params[0].values[0] == 99 + + +def test_fixed_opcode_count_multiple_patterns() -> None: + """ + Multiple overlapping patterns are handled correctly. + + Verifies that multiple overlapping patterns of different lengths are handled + correctly. The most specific (longest) matching pattern should win. 
+ """ + config = OpcodeCountsConfig( + scenario_configs={ + "test_.*": [1], + "test_bitwise.*": [2], + "test_bitwise.*AND.*": [3], + }, + default_counts=[99], + ) + + # Most specific pattern should win + params = config.get_test_parameters("test_bitwise[fork_Prague-opcode_AND]") + assert params[0].values[0] == 3 + + # Middle specificity + params = config.get_test_parameters("test_bitwise[fork_Prague-opcode_OR]") + assert params[0].values[0] == 2 + + # Least specific + params = config.get_test_parameters("test_other[fork_Prague]") + assert params[0].values[0] == 1 + + +@pytest.mark.parametrize( + "cli_input,expected_counts", + [ + ("1", [1]), # Single integer + ("1,2,3", [1, 2, 3]), # Multiple integers + ("0.5", [0.5]), # Single float + ("0.1,0.5,1", [0.1, 0.5, 1]), # Multiple floats + ("1,0.5,2", [1, 0.5, 2]), # Mixed int/float + # 10 mixed values + ( + "0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2,3", + [0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 3], + ), + ], +) +def test_fixed_opcode_count_valid_input( + cli_input: str, expected_counts: list +) -> None: + """ + Valid comma-separated numbers are accepted. + + The flag accepts comma-separated numbers (integers or floats) as default + opcode counts. This test verifies valid inputs are parsed correctly. + """ + mock_config = MagicMock() + mock_config.rootpath = Path("/tmp") + + result = OpcodeCountsConfig.from_parameter_value(mock_config, cli_input) + assert result is not None + assert result.default_counts == expected_counts + + +def test_fixed_opcode_count_invalid_input() -> None: + """ + Invalid values like test paths are rejected. + + The flag should reject invalid inputs like test paths that get accidentally + consumed by argparse. This prevents confusing errors when users forget to + specify opcode counts before the test path. + """ + mock_config = MagicMock() + mock_config.rootpath = Path("/tmp") + + with pytest.raises(pytest.UsageError) as exc_info: + OpcodeCountsConfig.from_parameter_value( + mock_config, "tests/benchmark/compute/test_foo.py" + ) + + assert "Invalid value for --fixed-opcode-count" in str(exc_info.value) + + +def test_fixed_opcode_count_missing_config() -> None: + """ + Missing config file raises UsageError with helpful message. + + When used without arguments, it expects to load config from + .fixed_opcode_counts.json. If the file is missing, a helpful UsageError + should be raised explaining where to create the config file. + """ + mock_config = MagicMock() + mock_config.rootpath = Path("/nonexistent/path") + + with pytest.raises(pytest.UsageError) as exc_info: + OpcodeCountsConfig.from_parameter_value(mock_config, "") + + assert ".fixed_opcode_counts.json" in str(exc_info.value) + assert "was not found" in str(exc_info.value) + + +def test_fixed_opcode_count_float_values() -> None: + """ + Float values are supported for sub-1K opcode iterations. + + For expensive precompiles that can't run 1000+ iterations within gas limits, + float values like 0.001 (1 opcode) or 0.5 (500 opcodes) can be used. 
+ """ + config = OpcodeCountsConfig( + scenario_configs={ + "test_precompile.*": [0.001, 0.01, 0.1], + }, + default_counts=[1.0], + ) + + counts = config.get_opcode_counts("test_precompile_bn128") + assert counts == [0.001, 0.01, 0.1] + + params = config.get_test_parameters("test_precompile_bn128") + assert len(params) == 3 + assert params[0].id == "opcount_0.001K" + assert params[1].id == "opcount_0.01K" + assert params[2].id == "opcount_0.1K" + + +def test_fixed_opcode_count_invalid_regex_raises_error() -> None: + """ + Invalid regex patterns raise an error. + + If a pattern in the config file contains invalid regex syntax, it should + raise a ValueError with a helpful message indicating which pattern is invalid. + """ + config = OpcodeCountsConfig( + scenario_configs={ + "[invalid(regex": [10.0], # Invalid regex + "test_valid.*": [5.0], + }, + default_counts=[1.0], + ) + + # Should raise error when trying to match against invalid regex + with pytest.raises(ValueError) as exc_info: + config.get_opcode_counts("test_other") + + assert "Invalid regex pattern" in str(exc_info.value) + assert "[invalid(regex" in str(exc_info.value) + + +@pytest.mark.parametrize( + "config_counts,expected_tests,expected_ids", + [ + pytest.param([1], 2, ["opcount_1"], id="single_int"), + pytest.param( + [1, 2, 3], + 6, + ["opcount_1", "opcount_2", "opcount_3"], + id="multiple_ints", + ), + pytest.param([0.5], 2, ["opcount_0.5"], id="single_float"), + pytest.param( + [0.5, 1, 2], + 6, + ["opcount_0.5", "opcount_1", "opcount_2"], + id="multiple_floats", + ), + pytest.param( + [1, 0.5, 2], + 6, + ["opcount_1", "opcount_0.5", "opcount_2"], + id="mixed_int_float", + ), + pytest.param( + [1, 2, 3, 5], + 8, + ["opcount_1", "opcount_2", "opcount_3", "opcount_5"], + id="four_ints", + ), + ], +) +def test_fixed_opcode_count_config_file_parametrized( + pytester: pytest.Pytester, + config_counts: list, + expected_tests: int, + expected_ids: list, +) -> None: + """ + Config file opcode counts create correct test variants. + + The config file can specify single counts, multiple counts, or float values. + Each should parametrize tests correctly. + """ + setup_test_directory_structure( + pytester, test_module_dummy, "test_config_counts.py" + ) + + config_file = pytester.path / ".fixed_opcode_counts.json" + config_file.write_text( + json.dumps( + { + "scenario_configs": { + "test_dummy_benchmark_test.*": config_counts + } + } + ) + ) + + # Place --fixed-opcode-count after test path to avoid argparse consuming + # the path as the option value (nargs='?' 
behavior) + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "tests/benchmark/dummy_test_module/", + f"--evm-bin={BENCHMARK_EVM_T8N}", + "--fixed-opcode-count", + "-v", + ) + + assert result.ret == 0 + # Check expected number of tests (2 test types * len(counts)) + assert any(f"{expected_tests} passed" in line for line in result.outlines) + # Check opcode count IDs are present + for expected_id in expected_ids: + assert any(expected_id in line for line in result.outlines) + + +# Test module with parametrized test for per-parameter pattern matching +test_module_parametrized = textwrap.dedent( + """\ + import pytest + from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op + + @pytest.mark.valid_at("Prague") + @pytest.mark.benchmark + @pytest.mark.parametrize("size", [0, 32, 256, 1024]) + def test_parametrized_benchmark(benchmark_test: BenchmarkTestFiller, size: int) -> None: + benchmark_test( + target_opcode=Op.JUMPDEST, + code_generator=JumpLoopGenerator(attack_block=Op.JUMPDEST), + ) + """ +) + + +@pytest.mark.parametrize( + "config,expected_test_ids", + [ + # Single count per parameter - different counts for different sizes + pytest.param( + { + "test_parametrized_benchmark.*size_0.*": [5], + "test_parametrized_benchmark.*size_256.*": [3], + "test_parametrized_benchmark.*size_1024.*": [2], + }, + [ + # size_0 -> 5, size_32 -> default (1), size_256 -> 3, size_1024 -> 2 + "size_0-opcount_5", + "size_32-opcount_1", + "size_256-opcount_3", + "size_1024-opcount_2", + ], + id="single_count_per_param", + ), + # Multiple counts per parameter (floats and ints) + pytest.param( + { + "test_parametrized_benchmark.*size_0.*": [0.5, 1, 2], + "test_parametrized_benchmark.*size_1024.*": [0.5, 0.75], + }, + [ + # size_0 gets [0.5, 1, 2], size_32 gets default [1], size_1024 gets [0.5, 0.75] + "size_0-opcount_0.5", + "size_0-opcount_1", + "size_0-opcount_2", + "size_32-opcount_1", + "size_256-opcount_1", + "size_1024-opcount_0.5", + "size_1024-opcount_0.75", + ], + id="multiple_counts_per_param", + ), + # Per-param patterns with test_.* fallback for unmatched params + pytest.param( + { + "test_parametrized_benchmark.*size_0.*": [5], + "test_parametrized_benchmark.*size_1024.*": [10], + "test_.*": [2, 3], # Fallback for size_32, size_256 + }, + [ + # size_0 -> [5] (specific), size_32 -> [2,3] (fallback), + # size_256 -> [2,3] (fallback), size_1024 -> [10] (specific) + "size_0-opcount_5", + "size_32-opcount_2", + "size_32-opcount_3", + "size_256-opcount_2", + "size_256-opcount_3", + "size_1024-opcount_10", + ], + id="per_param_with_fallback", + ), + # All params same counts via broad pattern + pytest.param( + { + "test_parametrized_benchmark.*": [1, 2, 3], + }, + [ + # All sizes get [1, 2, 3] + "size_0-opcount_1", + "size_0-opcount_2", + "size_0-opcount_3", + "size_32-opcount_1", + "size_1024-opcount_3", + ], + id="all_same_counts", + ), + ], +) +def test_fixed_opcode_count_per_parameter_patterns( + pytester: pytest.Pytester, + config: dict, + expected_test_ids: List[str], +) -> None: + """ + Per-parameter opcode count patterns work correctly. + + Patterns like "test_foo.*size_256.*" should match tests with that specific + parameter value and apply the corresponding opcode counts. 
+ """ + setup_test_directory_structure( + pytester, test_module_parametrized, "test_param_benchmark.py" + ) + + config_file = pytester.path / ".fixed_opcode_counts.json" + config_file.write_text(json.dumps({"scenario_configs": config})) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "tests/benchmark/dummy_test_module/", + f"--evm-bin={BENCHMARK_EVM_T8N}", + "--fixed-opcode-count", + "-v", + ) + + assert result.ret == 0 + + # Verify expected test IDs are present + output = "\n".join(result.outlines) + for expected_id in expected_test_ids: + assert expected_id in output, ( + f"Expected '{expected_id}' in output but not found.\n" + f"Output:\n{output}" + ) + + +def test_cli_mode_ignores_per_parameter_patterns( + pytester: pytest.Pytester, +) -> None: + """ + CLI mode applies same counts to all parameters. + + When using --fixed-opcode-count=1,5 (explicit CLI values), all test + variants should get the same opcode counts regardless of their parameters. + This verifies CLI mode doesn't accidentally use per-parameter matching. + """ + setup_test_directory_structure( + pytester, test_module_parametrized, "test_cli_mode.py" + ) + + result = pytester.runpytest( + "-c", + "pytest-fill.ini", + "--fork", + "Prague", + "--fixed-opcode-count=1,5", + "tests/benchmark/dummy_test_module/", + f"--evm-bin={BENCHMARK_EVM_T8N}", + "-v", + ) + + assert result.ret == 0 + output = "\n".join(result.outlines) + + # All size variants should have both opcount_1 and opcount_5 + for size in ["size_0", "size_32", "size_256", "size_1024"]: + assert ( + f"{size}-opcount_1.0K" in output or f"{size}-opcount_1K" in output + ), f"Expected {size} with opcount_1 in output" + assert ( + f"{size}-opcount_5.0K" in output or f"{size}-opcount_5K" in output + ), f"Expected {size} with opcount_5 in output" diff --git a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py index 03e261770c..04f4db57be 100644 --- a/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py +++ b/packages/testing/src/execution_testing/cli/pytest_commands/plugins/shared/benchmarking.py @@ -41,7 +41,12 @@ def pytest_addoption(parser: pytest.Parser) -> None: const="", help=( "Opcode counts (in thousands) for benchmark tests. " - "Example: '1,10,100' runs tests with 1K, 10K, 100K opcodes. " + "Granularity rules (for ≤10%% CALL overhead): " + "cheap ops (1-2 gas): integers only, no sub-1K; " + "medium ops (3-5 gas): 0.5 increments, min 0.5K; " + "expensive ops (6+ gas): 0.25 increments, min 0.25K; " + "very expensive (100+ gas): 0.25 increments, min 0.01K. " + "Example: '0.5,1,2' runs 500, 1K, 2K opcodes. " "Without value, uses .fixed_opcode_counts.json config. " f"Cannot be used with {GasBenchmarkValues.flag}." ), @@ -134,12 +139,12 @@ class GasBenchmarkValues(RootModel, BenchmarkParametrizer): @classmethod def from_parameter_value( - cls, config: pytest.Config, value: str + cls, _config: pytest.Config, value: str ) -> Self | None: """Given the parameter value and config, return the expected object.""" return cls.model_validate(value.split(",")) - def get_test_parameters(self, test_name: str) -> list[ParameterSet]: + def get_test_parameters(self, _test_name: str) -> list[ParameterSet]: """Get benchmark values. 
All tests have the same list.""" return [ pytest.param( @@ -153,8 +158,9 @@ def get_test_parameters(self, test_name: str) -> list[ParameterSet]: class OpcodeCountsConfig(BaseModel, BenchmarkParametrizer): """Opcode counts configuration object.""" - scenario_configs: Dict[str, List[int]] = Field(default_factory=dict) - default_counts: List[int] = Field(default_factory=lambda: [1]) + scenario_configs: Dict[str, List[float]] = Field(default_factory=dict) + default_counts: List[float] = Field(default_factory=lambda: [1.0]) + uses_config_file: bool = Field(default=False) default_config_file_name: ClassVar[str] = ".fixed_opcode_counts.json" flag: ClassVar[str] = "--fixed-opcode-count" @@ -169,48 +175,176 @@ def from_parameter_value( if value == "": default_file = Path(config.rootpath) / cls.default_config_file_name if default_file.exists(): - return cls.model_validate_json(default_file.read_bytes()) + data = default_file.read_bytes() + instance = cls.model_validate_json(data) + instance.uses_config_file = True + return instance else: - pytest.UsageError( + raise pytest.UsageError( "--fixed-opcode-count was provided without a value, but " f"{cls.default_config_file_name} was not found. " "Run 'uv run benchmark_parser' to generate it, or provide " "explicit values (e.g., --fixed-opcode-count 1,10,100)." ) - return cls.model_validate({"default_counts": value.split(",")}) + # Validate that value looks like comma-separated numbers (int or float) + # This catches the case where argparse greedily consumes a test path + parts = value.split(",") + + def is_number(s: str) -> bool: + try: + float(s.strip()) + return True + except ValueError: + return False + + if not all(is_number(part) for part in parts): + raise pytest.UsageError( + f"Invalid value for --fixed-opcode-count: '{value}'. " + "Expected comma-separated numbers (e.g., '1,10,100' or '0.25,0.5,1') or no value " + "to use the config file. If providing a value, use --fixed-opcode-count=VALUE " + "syntax to avoid argparse consuming test paths as the value." + ) + return cls.model_validate( + {"default_counts": parts, "uses_config_file": False} + ) - def get_test_parameters(self, test_name: str) -> list[ParameterSet]: + def get_opcode_counts(self, test_name: str) -> list[float]: """ - Get opcode counts for a test using regex pattern matching. + Get opcode counts for a test using pattern matching. + + Matching priority: + 1. Exact match in scenario_configs + 2. Regex pattern match (longest pattern wins for specificity) + 3. Default counts as fallback + + Example with config: + {"test_dup": [10], "test_dup.*": [1], "test_dup.*DUP1.*": [5]} + + - "test_dup" -> [10] (exact match) + - "test_dup[fork_Prague-opcode_DUP1]" -> [5] (longest pattern matches) + - "test_dup[fork_Prague-opcode_DUP2]" -> [1] (matches "test_dup.*") + - "test_other" -> default_counts (no match) + + Note: In config file mode, test names don't have opcount yet when this + is called - we look up the count first, then add it to the test name. 
""" counts = self.default_counts - # Try exact match first (faster) + if test_name in self.scenario_configs: counts = self.scenario_configs[test_name] else: - # Try regex patterns + matches: list[tuple[str, list[float]]] = [] for pattern, pattern_counts in self.scenario_configs.items(): if pattern == test_name: continue try: if re.search(pattern, test_name): - counts = pattern_counts - break - except re.error: - continue + matches.append((pattern, pattern_counts)) + except re.error as e: + raise ValueError( + f"Invalid regex pattern '{pattern}' in config: {e}" + ) + + if matches: + matches.sort(key=lambda x: len(x[0]), reverse=True) + counts = matches[0][1] + + return counts + + def get_test_parameters(self, test_name: str) -> list[ParameterSet]: + """Get opcode counts as pytest parameters.""" + # Deduplicate while preserving order + unique_counts = list(dict.fromkeys(self.get_opcode_counts(test_name))) return [ - pytest.param( - opcode_count, - id=f"opcount_{opcode_count}K", - ) - for opcode_count in counts + pytest.param(opcode_count, id=f"opcount_{opcode_count}K") + for opcode_count in unique_counts ] + def parametrize(self, metafunc: pytest.Metafunc) -> None: + """ + Parametrize a test with opcode counts. + + In config file mode with existing parametrizations (from metafunc._calls), + generates opcode counts per-parameter by matching patterns against simulated + test IDs built from existing params. + + In CLI mode (explicit counts), uses the function name for pattern matching. + """ + # Check for direct or indirect use of fixed_opcode_count. + # The benchmark_test fixture depends on fixed_opcode_count, so if the test + # uses benchmark_test, we need to parametrize fixed_opcode_count. + if self.parameter_name not in metafunc.fixturenames: + if "benchmark_test" not in metafunc.fixturenames: + return + # benchmark_test uses fixed_opcode_count - add it to fixtures + metafunc.fixturenames.append(self.parameter_name) + + test_name = metafunc.function.__name__ + + if ( + self.uses_config_file + and hasattr(metafunc, "_calls") + and metafunc._calls + ): + # Config file mode with existing parametrizations: + # Build simulated IDs from existing params and match patterns + self._parametrize_with_existing_params(metafunc, test_name) + elif self.uses_config_file: + # Config file mode, no existing params: match against function name + metafunc.parametrize( + self.parameter_name, + self.get_test_parameters(test_name), + scope="function", + ) + else: + # CLI mode: use function name matching (original behavior) + metafunc.parametrize( + self.parameter_name, + self.get_test_parameters(test_name), + scope="function", + ) + + def _parametrize_with_existing_params( + self, metafunc: pytest.Metafunc, test_name: str + ) -> None: + """ + Parametrize opcode counts based on existing test parameters. + + For each existing parameter combination in metafunc._calls, build a simulated + test ID and match patterns to get the appropriate opcode counts. + + We collect ALL unique counts across all parameter combinations and add them + as a simple parametrization. This creates all combinations (cartesian product). + Unwanted combinations are filtered out later in pytest_collection_modifyitems. 
+ """ + # Collect opcode counts for each call (indexed by position) + all_unique_counts: set[float] = set() + + for call in metafunc._calls: + # Build simulated test ID using call.id which is already properly formatted + # Format: test_name[fork_--] + simulated_id = f"{test_name}[{call.id}]" if call.id else test_name + + # Get opcode counts for this simulated ID and add to unique set + counts = self.get_opcode_counts(simulated_id) + all_unique_counts.update(counts) + + # Add all unique counts as simple parametrization (multiplies with existing) + # Unwanted combinations will be filtered in pytest_collection_modifyitems + metafunc.parametrize( + self.parameter_name, + [ + pytest.param(count, id=f"opcount_{count}K") + for count in sorted(all_unique_counts) + ], + scope="function", + ) + def pytest_collection_modifyitems( config: pytest.Config, items: list[pytest.Item] ) -> None: - """Filter tests based on repricing marker.""" + """Filter tests based on repricing marker and opcode count patterns.""" gas_benchmark_value = GasBenchmarkValues.from_config(config) fixed_opcode_count = OpcodeCountsConfig.from_config(config) @@ -234,6 +368,10 @@ def pytest_collection_modifyitems( filtered.append(item) items[:] = filtered + # Filter per-parameter opcode counts if using config file mode + if fixed_opcode_count.uses_config_file: + _filter_opcode_count_combinations(items, fixed_opcode_count) + # Extract the specified flag from the command line. # If the `-m repricing` flag is not specified, or is negated, # we skip filtering tests by the repricing marker. @@ -266,8 +404,60 @@ def pytest_collection_modifyitems( items[:] = filtered +def _filter_opcode_count_combinations( + items: list[pytest.Item], opcode_config: "OpcodeCountsConfig" +) -> None: + """ + Filter test items to only keep valid opcode count combinations. + + When using config file mode with per-parameter patterns, we generate all + combinations (cartesian product) in pytest_generate_tests. Here we filter + out combinations where the opcode count doesn't match the pattern for + that specific parameter combination. + """ + filtered = [] + + for item in items: + if not hasattr(item, "callspec"): + filtered.append(item) + continue + + params = item.callspec.params + opcode_count = params.get(OpcodeCountsConfig.parameter_name) + + if opcode_count is None: + filtered.append(item) + continue + + # Build simulated test ID WITHOUT the opcode count for pattern matching + # The test name format is: test_func[fork_X-fixture_format-params-opcount_Y] + # We need: test_func[fork_X-fixture_format-params] + test_name = item.name + + # Remove the opcode count part from the test ID for pattern matching + # Pattern: -opcount_X.XK or -opcount_XK at the end before ] + import re + + simulated_id = re.sub(r"-opcount_[\d.]+K\]$", "]", test_name) + + # Get valid counts for this parameter combination + valid_counts = opcode_config.get_opcode_counts(simulated_id) + + # Keep item only if its opcode count is valid for this combination + if opcode_count in valid_counts: + filtered.append(item) + + items[:] = filtered + + +@pytest.hookimpl(trylast=True) def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: - """Generate tests for the gas benchmark values and fixed opcode counts.""" + """ + Generate tests for the gas benchmark values and fixed opcode counts. + + Uses trylast=True to run after other parametrizations so we can access + existing parameters in metafunc._calls for pattern matching. 
+ """ parametrizer = GasBenchmarkValues.from_config( metafunc.config ) or OpcodeCountsConfig.from_config(metafunc.config) diff --git a/packages/testing/src/execution_testing/specs/benchmark.py b/packages/testing/src/execution_testing/specs/benchmark.py index ce4cb2aca1..d7c587eb5e 100644 --- a/packages/testing/src/execution_testing/specs/benchmark.py +++ b/packages/testing/src/execution_testing/specs/benchmark.py @@ -54,7 +54,7 @@ class BenchmarkCodeGenerator(ABC): setup: Bytecode = field(default_factory=Bytecode) cleanup: Bytecode = field(default_factory=Bytecode) tx_kwargs: Dict[str, Any] = field(default_factory=dict) - fixed_opcode_count: int | None = None + fixed_opcode_count: float | None = None code_padding_opcode: Op | None = None _contract_address: Address | None = None _inner_iterations: int = 1000 @@ -78,10 +78,13 @@ def deploy_fix_count_contracts(self, *, pre: Alloc, fork: Fork) -> Address: "fixed_opcode_count is not set" ) # Adjust outer loop iterations based on inner iterations - # If inner is 500 instead of 1000, double the outer loop - outer_multiplier = 1000 // self._inner_iterations - iterations = self.fixed_opcode_count * outer_multiplier - + if self.fixed_opcode_count < 1.0: + # < 1000 opcodes, outer = 1 as inner already set to exact count + iterations = 1 + else: + # >= 1000: calculate outer iterations from target / inner + target_opcodes = int(self.fixed_opcode_count * 1000) + iterations = target_opcodes // self._inner_iterations prefix = Op.CALLDATACOPY( Op.PUSH0, Op.PUSH0, Op.CALLDATASIZE ) + Op.PUSH4(iterations) @@ -193,9 +196,40 @@ def generate_repeated_code( # # 2a. If N is 1000: Set M = fixed_opcode_count. (Total ops: fixed_opcode_count * 1000) # 2b. If N is 500: Set M = fixed_opcode_count * 2. (Total ops: (fixed_opcode_count * 2) * 500 = fixed_opcode_count * 1000) + # + # --- 3. Sub-1K Case (fixed_opcode_count < 1.0) --- + # For Sub-1K counts (e.g., 0.25 = 250 opcodes), set N = exact count, M = 1. if self.fixed_opcode_count is not None: - inner_iterations = 1000 if max_iterations >= 1000 else 500 - self._inner_iterations = min(max_iterations, inner_iterations) + if self.fixed_opcode_count < 0.001: + raise ValueError( + f"fixed_opcode_count must be >= 0.001 (1 opcode), " + f"got {self.fixed_opcode_count}" + ) + if self.fixed_opcode_count < 1.0: + # < 1000 opcodes, inner = exact count, outer = 1 + self._inner_iterations = min( + max_iterations, int(self.fixed_opcode_count * 1000) + ) + else: + # >= 1000 opcodes: use 250 inner iterations (0.25K granularity) + target_opcodes = int(self.fixed_opcode_count * 1000) + + if max_iterations >= 250 and target_opcodes % 250 == 0: + inner_iterations = 250 + elif max_iterations >= target_opcodes: + # Use exact count as inner with outer = 1 + inner_iterations = target_opcodes + else: + raise ValueError( + f"fixed_opcode_count {self.fixed_opcode_count} ({target_opcodes} opcodes) " + f"exceeds max contract size for this attack block.\n" + f"Contract size limit allows up to {max_iterations} opcodes " + f"({max_iterations / 1000:.3f}K) in the inner loop.\n" + f"For counts above this limit, use multiples of 0.25K " + f"(e.g., {((target_opcodes // 250) * 250) / 1000:.2f} or " + f"{((target_opcodes // 250 + 1) * 250) / 1000:.2f})." + ) + self._inner_iterations = inner_iterations # TODO: Unify the PUSH0 and PUSH1 usage. 
iterations = ( @@ -247,7 +281,7 @@ class BenchmarkTest(BaseTest): gas_benchmark_value: int = Field( default_factory=lambda: int(Environment().gas_limit) ) - fixed_opcode_count: int | None = None + fixed_opcode_count: float | None = None target_opcode: Op | None = None code_generator: BenchmarkCodeGenerator | None = None diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md new file mode 100644 index 0000000000..c1a8aafffa --- /dev/null +++ b/tests/benchmark/README.md @@ -0,0 +1,298 @@ +# Benchmark Tests + +Benchmark tests measure EVM opcode and precompile performance to support gas repricing analysis. These tests are designed to stress specific operations under controlled conditions, allowing accurate measurement of execution costs. + +## Two Benchmarking Modes + +The framework supports two distinct benchmarking approaches: + +| Mode | Flag | Unit | Use Case | +|------|------|------|----------| +| Gas-based | `--gas-benchmark-values` | Millions of gas | CI pipelines, traditional benchmarking | +| Fixed opcode count | `--fixed-opcode-count` | Thousands of opcodes | Fast iteration for gas repricing research | + +## Gas Benchmark Values + +The gas-based mode runs benchmark tests with specified gas limits. This is the traditional approach used in CI pipelines. Values are specified in **millions** of gas (e.g., `100` means 100 million gas). + +```bash +uv run fill --fork Prague --gas-benchmark-values 1 -m benchmark ./tests/benchmark --evm-bin=evmone-t8n +``` + +This mode is useful for measuring how much work can be done within a given gas budget, simulating real-world block execution conditions. + +## Fixed Opcode Count + +The fixed opcode count mode runs benchmark tests with a predetermined number of opcode iterations rather than gas-based limits. This approach enables rapid iteration when analyzing gas costs for repricing proposals, as you can directly compare execution times across different opcode counts. + +**Important:** Tests must be marked with `@pytest.mark.repricing` to be compatible with fixed opcode count mode. This marker identifies tests that have been specifically designed for gas repricing analysis with proper code generators. + +### CLI Mode + +When you want to apply the same opcode counts to all tests, pass the values directly on the command line: + +```bash +# Run all repricing tests with 1K opcodes +uv run fill --fork Prague --fixed-opcode-count=1 -m repricing tests/benchmark --evm-bin=evmone-t8n + +# Run with multiple counts (1K, 5K, and 10K opcodes) +uv run fill --fork Prague --fixed-opcode-count=1,5,10 -m repricing tests/benchmark --evm-bin=evmone-t8n +``` + +### Config File Mode + +For more granular control, you can use a configuration file that specifies different opcode counts for different tests. This is particularly useful when benchmarking a mix of cheap and expensive operations that require different iteration counts. + +```bash +uv run fill --fork Prague --fixed-opcode-count -m repricing tests/benchmark --evm-bin=evmone-t8n +``` + +When invoked without a value, the framework reads from `.fixed_opcode_counts.json` in the repository root. 
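+
+The matching rules described in the following sections boil down to a lookup along
+these lines. This is a simplified sketch for orientation only; the helper name
+`resolve_counts` is illustrative and not part of the framework:
+
+```python
+import re
+
+
+def resolve_counts(
+    test_id: str,
+    scenario_configs: dict[str, list[float]],
+    default_counts: list[float],
+) -> list[float]:
+    """Simplified lookup: exact match, then longest matching regex, else defaults."""
+    if test_id in scenario_configs:
+        return scenario_configs[test_id]
+    matches = [
+        (pattern, counts)
+        for pattern, counts in scenario_configs.items()
+        if re.search(pattern, test_id)
+    ]
+    if matches:
+        # The longest (most specific) matching pattern wins.
+        return max(matches, key=lambda m: len(m[0]))[1]
+    return default_counts
+
+
+# resolve_counts("test_add[fork_Prague-opcode_ADD]",
+#                {"test_add.*": [1, 5, 10], "test_.*": [1]}, [1]) -> [1, 5, 10]
+```
+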
+ +### Config File Format + +Create a `.fixed_opcode_counts.json` file in the repository root with the following structure: + +```json +{ + "scenario_configs": { + "test_add.*": [1, 5, 10], + "test_ecrecover.*": [0.01, 0.1], + "test_bn128.*size_256.*": [0.001], + "test_.*": [1] + } +} +``` + +The pattern matching system works as follows: + +- **Regex patterns**: Each key in `scenario_configs` is a regular expression matched against test names +- **Longest match wins**: When multiple patterns match, the longest (most specific) pattern takes precedence +- **Per-parameter matching**: Patterns can target specific parametrized test variants (see below) +- **Global fallback**: Use `"test_.*"` as a catch-all pattern for any unmatched tests + +### Per-Parameter Pattern Matching + +For parametrized tests, you can specify different opcode counts for each parameter value. This is essential when different parameter values have significantly different execution costs. + +Consider a `test_codecopy_benchmark` test parametrized by `code_size`. Larger code sizes are more expensive per iteration, so you need fewer iterations to achieve meaningful measurements: + +```json +{ + "scenario_configs": { + "test_codecopy_benchmark.*code_size_0.*": [2500, 5000, 7500, 10000], + "test_codecopy_benchmark.*code_size_32.*": [2000, 4000, 6000, 8000], + "test_codecopy_benchmark.*code_size_256.*": [750, 1500, 2250, 3000], + "test_codecopy_benchmark.*code_size_1024.*": [250, 500, 750, 1000], + "test_codecopy_benchmark.*code_size_24576.*": [10, 20, 30, 40], + "test_codecopy_benchmark.*": [1] + } +} +``` + +**How pattern ordering works:** + +1. When a test like `test_codecopy_benchmark[code_size_256]` runs, the framework checks all patterns +2. Multiple patterns may match: both `test_codecopy_benchmark.*code_size_256.*` and `test_codecopy_benchmark.*` +3. The **longest matching pattern wins**, so `code_size_256` gets `[750, 1500, 2250, 3000]` +4. The shorter fallback pattern `test_codecopy_benchmark.*` only applies to parameter values without specific patterns + +**Best practices for per-parameter configs:** + +- Start with specific patterns for each parameter value you want to customize +- Add a broader fallback pattern (e.g., `test_foo.*`) for any unconfigured parameter values +- Use the `test_.*` pattern as a global default for entirely unmatched tests +- Order doesn't matter in the JSON - the framework always uses longest match, not first match + +### Generating the Config File + +The benchmark parser tool can automatically generate and update the configuration file by scanning your test modules: + +```bash +# Generate or update .fixed_opcode_counts.json +uv run benchmark_parser + +# Validate that config is in sync +uv run benchmark_parser --check +``` + +The parser preserves any custom counts you've configured while adding new tests with default values. + +### Understanding Opcode Count Values + +Values represent **thousands of opcodes**: + +- `1` = 1,000 opcodes (1K) +- `0.5` = 500 opcodes +- `0.25` = 250 opcodes +- `0.001` = 1 opcode + +The minimum supported value is `0.001` (a single opcode iteration). + +### Granularity Rules + +The benchmark framework uses outer CALL loops to achieve the target opcode count. Each CALL adds approximately 150 gas of overhead. 
To keep this overhead at or below 10% of the measured work, follow these granularity guidelines when setting the opcode count: + +| Opcode Gas Cost | Recommended Granularity | Minimum Value | Example Opcodes | +|-----------------|------------------------|---------------|-----------------| +| 1-2 gas | Integers only | 1K | JUMPDEST, POP, PUSH | +| 3-5 gas | 0.5 increments | 0.5K | ADD, MUL, SUB, DIV | +| 6+ gas | 0.25 increments | 0.25K | ADDMOD, MULMOD, EXP | +| 100+ gas | 0.25 increments | 0.01K | CALL, SLOAD, precompiles | + +For example, testing JUMPDEST (1 gas) with only 100 iterations would mean the CALL overhead dominates the measurement. Using 1,000+ iterations ensures the actual opcode cost is the primary factor. + +### Example Configuration + +Here's a comprehensive configuration demonstrating granularity rules and per-parameter matching: + +```json +{ + "scenario_configs": { + "test_jumpdest.*": [1, 2, 5], + "test_add.*": [0.5, 1, 2], + "test_keccak.*": [0.25, 0.5, 1], + "test_ecrecover.*": [0.01, 0.1], + "test_bn128_pairing.*": [0.001], + + "test_codecopy.*code_size_0.*": [2500, 5000, 10000], + "test_codecopy.*code_size_256.*": [750, 1500, 3000], + "test_codecopy.*code_size_24576.*": [10, 20, 50], + "test_codecopy.*": [1], + + "test_.*": [1] + } +} +``` + +This configuration: + +1. **Cheap opcodes** (JUMPDEST, ADD): Use integer counts ≥1K +2. **Medium opcodes** (KECCAK256): Use 0.25K-1K range +3. **Expensive precompiles** (ECRECOVER, BN128): Use sub-1K counts +4. **Per-parameter tests** (CODECOPY): Different counts based on `code_size` parameter +5. **Global fallback** (`test_.*`): Catches any test not explicitly configured + +## The Repricing Marker + +Tests intended for gas repricing analysis must be marked with `@pytest.mark.repricing`. This marker serves two purposes: + +1. **Filtering**: Use `-m repricing` to run only repricing-relevant tests +2. 
**Compatibility**: Fixed opcode count mode requires this marker to ensure tests have proper code generators + +### Listing Repricing Tests + +To see all available repricing tests without running them: + +```bash +# List all repricing test names +uv run fill --fork Prague --fixed-opcode-count -m repricing tests/benchmark --collect-only -q + +# List with full test IDs (includes parameters) +uv run fill --fork Prague --fixed-opcode-count -m repricing tests/benchmark --collect-only +``` + +### Running Repricing Tests + +```bash +# Run only repricing tests (recommended for fixed opcode count) +uv run fill --fork Prague --fixed-opcode-count -m repricing tests/benchmark --evm-bin=evmone-t8n + +# Run all benchmark tests (gas-based mode typically) +uv run fill --fork Prague --gas-benchmark-values 1 -m benchmark tests/benchmark --evm-bin=evmone-t8n +``` + +## Execute Remote + +To run benchmarks against a live network for real-world performance measurement: + +```bash +uv run execute remote --fixed-opcode-count --fork Prague -m repricing tests/benchmark \ + --rpc-seed-key --rpc-endpoint --chain-id +``` + +## Writing Benchmark Tests + +Here's an example of a properly structured benchmark test for fixed opcode count mode: + +```python +import pytest +from execution_testing import BenchmarkTestFiller, JumpLoopGenerator, Op + +@pytest.mark.valid_at("Prague") +@pytest.mark.repricing # Required for fixed opcode count mode +def test_add(benchmark_test: BenchmarkTestFiller) -> None: + """Benchmark the ADD opcode with representative operands.""" + benchmark_test( + target_opcode=Op.ADD, + code_generator=JumpLoopGenerator( + attack_block=Op.ADD(1, 2) + Op.POP, + ), + ) +``` + +Key elements: + +- `@pytest.mark.repricing`: Marks this test for gas repricing analysis +- `benchmark_test` fixture: Provides the `BenchmarkTestFiller` for fixed opcode count mode +- `target_opcode`: The opcode being measured (used for validation) +- `code_generator`: Defines the bytecode pattern to repeat + +## Continuous Integration + +The benchmark CI pipeline runs automatically on pushes to `mainnet` and `forks/**` branches, as well as on pull requests. It validates benchmark functionality and generates fixture artifacts. + +### CI Workflow + +The `.github/workflows/benchmark.yaml` workflow runs in stages: + +1. **Unit Tests**: Runs benchmark framework unit tests +2. **Sanity Checks**: Validates both benchmarking modes work correctly +3. 
**Build Artifact**: Generates benchmark fixture artifacts (on push only) + +### Tox Environments + +Run these locally to validate benchmark functionality: + +```bash +# Run benchmark framework unit tests +uvx tox -e tests_benchmark_pytest_py3 + +# Test gas-based benchmarking mode +uvx tox -e benchmark-gas-values + +# Test fixed opcode count CLI mode +uvx tox -e benchmark-fixed-opcode-cli + +# Test fixed opcode count config file mode (runs benchmark_parser first) +uvx tox -e benchmark-fixed-opcode-config +``` + +| Tox Environment | Description | +|-----------------|-------------| +| `tests_benchmark_pytest_py3` | Unit tests for the benchmarking plugin | +| `benchmark-gas-values` | Fills tests with `--gas-benchmark-values 1` | +| `benchmark-fixed-opcode-cli` | Fills tests with `--fixed-opcode-count 1` | +| `benchmark-fixed-opcode-config` | Runs `benchmark_parser`, then fills with config file | + +### Fixture Configurations + +The `.github/configs/feature.yaml` defines benchmark fixture builds: + +| Config | Fork | Gas Values | Description | +|--------|------|------------|-------------| +| `benchmark` | Prague | 1,5,10,30,60,100,150 | Full benchmark suite | +| `benchmark_develop` | Osaka | 1,5,10,30,60,100,150 | Development fork benchmarks | +| `benchmark_fast` | Prague | 100 | Quick benchmark for CI artifacts | + +### Artifact Generation + +Every time a PR touching benchmark code is merged to `mainnet` or `forks/**` branches, the CI automatically generates benchmark fixture artifacts. These artifacts use the `benchmark_fast` configuration with **100M gas only** (not the full 1-150M range) to keep build times reasonable. + +The artifacts are published as GitHub releases and can be used for execution testing against client implementations. + +To build the same fixtures locally: + +```bash +uv run fill --fork Prague --gas-benchmark-values 100 -m benchmark tests/benchmark --evm-bin=evmone-t8n +``` diff --git a/tests/benchmark/compute/instruction/test_arithmetic.py b/tests/benchmark/compute/instruction/test_arithmetic.py index 765541e4e9..c6a7261fe1 100644 --- a/tests/benchmark/compute/instruction/test_arithmetic.py +++ b/tests/benchmark/compute/instruction/test_arithmetic.py @@ -185,7 +185,6 @@ def test_arithmetic( ) -@pytest.mark.repricing(mod_bits=127) @pytest.mark.parametrize("mod_bits", [255, 191, 127, 63]) @pytest.mark.parametrize("opcode", [Op.MOD, Op.SMOD]) def test_mod( diff --git a/tests/benchmark/compute/instruction/test_log.py b/tests/benchmark/compute/instruction/test_log.py index 956dc8acb2..5e904552d1 100644 --- a/tests/benchmark/compute/instruction/test_log.py +++ b/tests/benchmark/compute/instruction/test_log.py @@ -21,11 +21,11 @@ @pytest.mark.parametrize( "opcode", [ - pytest.param(Op.LOG0, id="log0"), - pytest.param(Op.LOG1, id="log1"), - pytest.param(Op.LOG2, id="log2"), - pytest.param(Op.LOG3, id="log3"), - pytest.param(Op.LOG4, id="log4"), + Op.LOG0, + Op.LOG1, + Op.LOG2, + Op.LOG3, + Op.LOG4, ], ) @pytest.mark.parametrize( @@ -88,11 +88,11 @@ def test_log( @pytest.mark.parametrize( "opcode", [ - pytest.param(Op.LOG0, id="log0"), - pytest.param(Op.LOG1, id="log1"), - pytest.param(Op.LOG2, id="log2"), - pytest.param(Op.LOG3, id="log3"), - pytest.param(Op.LOG4, id="log4"), + Op.LOG0, + Op.LOG1, + Op.LOG2, + Op.LOG3, + Op.LOG4, ], ) @pytest.mark.parametrize("mem_size", [0, 32, 256, 1024]) diff --git a/tests/benchmark/compute/instruction/test_system.py b/tests/benchmark/compute/instruction/test_system.py index b7fbdc10a9..365d8c39b7 100644 --- 
a/tests/benchmark/compute/instruction/test_system.py +++ b/tests/benchmark/compute/instruction/test_system.py @@ -352,8 +352,17 @@ def test_create( ) if opcode == Op.CREATE2: - # For CREATE2, we provide an initial salt. - setup += Op.PUSH1(42) + # For CREATE2, load salt from storage (persist across outer loop calls) + # If storage is 0 (first call), use initial salt of 42. + # Stack after setup: [..., value, code_size, salt] + setup += ( + Op.SLOAD(0) # Load saved salt + + Op.DUP1 # Duplicate for check + + Op.ISZERO # Check if zero + + Op.PUSH1(42) # Default salt + + Op.MUL # 42 if zero, 0 if not + + Op.ADD # Add to get final salt (saved or 42) + ) attack_block = ( # For CREATE: @@ -363,10 +372,16 @@ def test_create( if opcode == Op.CREATE # For CREATE2: we manually push the arguments because we leverage the # return value of previous CREATE2 calls as salt for the next CREATE2 - # call. + # call. After CREATE2, save result to storage for next outer loop call. # - DUP4 is targeting the PUSH1(value) from the code_prefix. # - DUP3 is targeting the EXTCODESIZE value pushed in code_prefix. - else Op.DUP3 + Op.PUSH0 + Op.DUP4 + Op.CREATE2 + else Op.DUP3 + + Op.PUSH0 + + Op.DUP4 + + Op.CREATE2 + + Op.DUP1 + + Op.PUSH0 + + Op.SSTORE ) benchmark_test( diff --git a/tests/benchmark/compute/precompile/test_ecrecover.py b/tests/benchmark/compute/precompile/test_ecrecover.py index 63c4c72e08..22aca3b5ca 100644 --- a/tests/benchmark/compute/precompile/test_ecrecover.py +++ b/tests/benchmark/compute/precompile/test_ecrecover.py @@ -12,6 +12,7 @@ from tests.benchmark.compute.helpers import concatenate_parameters +@pytest.mark.repricing @pytest.mark.parametrize( "precompile_address,calldata", [ @@ -28,7 +29,6 @@ ] ), id="ecrecover", - marks=pytest.mark.repricing, ) ], ) diff --git a/tests/benchmark/compute/precompile/test_point_evaluation.py b/tests/benchmark/compute/precompile/test_point_evaluation.py index e179f27526..7597a9fa1d 100644 --- a/tests/benchmark/compute/precompile/test_point_evaluation.py +++ b/tests/benchmark/compute/precompile/test_point_evaluation.py @@ -13,6 +13,7 @@ from tests.cancun.eip4844_blobs.spec import Spec as BlobsSpec +@pytest.mark.repricing @pytest.mark.parametrize( "precompile_address,calldata", [ @@ -28,7 +29,6 @@ ] ), id="point_evaluation", - marks=pytest.mark.repricing, ), ], ) diff --git a/whitelist.txt b/whitelist.txt index 4f73730fd5..0b2d511e4f 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -494,6 +494,7 @@ epilog eq ERC Erigon +errlines esbenp etc ETH @@ -770,6 +771,7 @@ makereport marcin marioevz markdownlint +markexpr master matchers mcopy @@ -856,6 +858,7 @@ ommers oneliner oob opc +opcount opcode's OpenSSL oprypin @@ -876,6 +879,7 @@ P7692 paradigmxyz param parametrization +parametrizations parametrize parametrized parametrizer