From 9fc2239f9e3b4c02224e407d4904f729166d82c2 Mon Sep 17 00:00:00 2001
From: haosdent
Date: Fri, 8 May 2026 15:05:54 +0800
Subject: [PATCH] [CI][Bugfix] Make test_gpt2_cache_hit observable across V1
 EngineCore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test patches `torch.fx.experimental.symbolic_shapes.make_symbol` in
the parent process and counts symbol creations via a
`multiprocessing.Value`. In V1 the actual compile runs inside an
`EngineCore` subprocess that vLLM spawns whenever CUDA is initialized in
the parent (via `_maybe_force_spawn`), so the parent-process patch never
reaches the compile path and the counter stays at 0.

Replace the brittle torch-internal patch with `LLM.collective_rpc` to
snapshot `compilation_counter` from the EngineCore subprocess itself.
This is process-model agnostic: it works under default V1
multiprocessing without sharing memory between the test and the engine.

Each phase creates a fresh `LLM(...)` and tears it down (`del llm_model`
followed by `disable_envs_cache()`), so per-phase counters start at zero
without needing the previous activation-registry workaround. Phase 2
also sets `VLLM_FORCE_AOT_LOAD=1` as a fail-loud guard (raises
FileNotFoundError on cache miss) on top of the counter assertion.

`collective_rpc(callable)` requires pickle-based serialization, so the
test sets `VLLM_ALLOW_INSECURE_SERIALIZATION=1` (the same pattern other
collective_rpc-using tests follow, e.g.
`tests/v1/e2e/general/test_pooling_chunked_prefill.py`).

Signed-off-by: haosdent
---
 tests/compile/test_aot_compile.py | 92 ++++++++++++++-----------------
 1 file changed, 41 insertions(+), 51 deletions(-)

diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index c3a065c56142..13e988307047 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import functools
 import hashlib
-import multiprocessing
 import os
 import pickle
 import tempfile
@@ -15,7 +13,6 @@
 import torch
 
 import vllm.envs as envs
-import vllm.model_executor.layers.activation
 from vllm.compilation.backends import VllmBackend
 from vllm.compilation.caching import (
     StandaloneCompiledArtifacts,
@@ -476,64 +473,57 @@ def test_standalone_compile_correctness():
 @create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
     """
-    Test that compiling gpt2 twice results in a cache hit and
-    capture torch dynamic symbol creations to ensure make_symbol
-    not called on cache hit.
-    """
+    Test that compiling gpt2 twice results in a cache hit.
 
-    import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module
-    from torch.utils._sympy.symbol import make_symbol
+    Counter values are read from the EngineCore subprocess via
+    ``LLM.collective_rpc`` so the test works under default V1
+    multiprocessing (no shared memory between test and engine).
+    """
 
     from vllm import LLM
 
-    create_symbol_counter = multiprocessing.Value("i", 0)
-    original_make_symbol = make_symbol
+    def _snap(self):
+        from vllm.compilation.counter import compilation_counter
 
-    @functools.wraps(original_make_symbol)
-    def counting_make_symbol(prefix, idx, **kwargs):
-        with create_symbol_counter.get_lock():
-            create_symbol_counter.value += 1
-        return original_make_symbol(prefix, idx, **kwargs)
-
-    symbolic_shapes_module.make_symbol = counting_make_symbol
-    try:
-        with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
-            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
-            m.setenv("VLLM_USE_AOT_COMPILE", "1")
-            # First compilation - initialize model and generate
-            llm_model = LLM(
-                model="gpt2",
-                compilation_config=CompilationConfig(
-                    mode=CompilationMode.VLLM_COMPILE,
-                ),
-                max_model_len=256,
-            )
+        return (
+            compilation_counter.num_aot_compiles,
+            compilation_counter.num_aot_artifacts_saved,
+            compilation_counter.num_aot_artifacts_loaded,
+        )
 
-            llm_model.generate("Hello, my name is")
-            assert create_symbol_counter.value == 2
-            create_symbol_counter.value = 0
+    # collective_rpc(callable) requires pickle-based serialization.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
-            # Clean up first model
-            del llm_model
-            disable_envs_cache()
-            vllm.model_executor.layers.activation._ACTIVATION_REGISTRY._dict.clear()
+    with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+        m.setenv("VLLM_USE_AOT_COMPILE", "1")
+        # First compilation - initialize model and generate
+        llm_model = LLM(
+            model="gpt2",
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+            ),
+            max_model_len=256,
+        )
 
-            # Second compilation - should hit cache
-            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
-            llm_model = LLM(
-                model="gpt2",
-                compilation_config=CompilationConfig(
-                    mode=CompilationMode.VLLM_COMPILE,
-                ),
-                max_model_len=256,
-            )
-            llm_model.generate("Hello, my name is")
+        llm_model.generate("Hello, my name is")
+        assert llm_model.collective_rpc(_snap)[0] == (1, 1, 0)
 
-            assert create_symbol_counter.value == 0
+        # Clean up first model
+        del llm_model
+        disable_envs_cache()
 
-    finally:
-        # Restore original method
-        symbolic_shapes_module.make_symbol = original_make_symbol
+        # Second compilation - should hit cache
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        llm_model = LLM(
+            model="gpt2",
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+            ),
+            max_model_len=256,
+        )
+        llm_model.generate("Hello, my name is")
+        assert llm_model.collective_rpc(_snap)[0] == (0, 0, 1)
 
 
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")