6 changes: 1 addition & 5 deletions tests/ut/batch_invariant/test_batch_invariant.py
@@ -127,6 +127,7 @@ def test_init_batch_invariance(self, batch_invariant_enabled, has_backend, expec
         """Test init_batch_invariance under different conditions"""
         # Mock dependencies
         import vllm.envs as envs
+
         envs.VLLM_BATCH_INVARIANT = batch_invariant_enabled
         batch_invariant.HAS_TRITON = has_backend
         batch_invariant.HAS_ASCENDC_BATCH_INVARIANT = has_backend
@@ -151,11 +152,8 @@ def test_init_batch_invariance(self, batch_invariant_enabled, has_backend, expec
     def test_add_rms_norm(self, mock_torch_npu):
         """Test add_rms_norm function"""
         # Mock dependencies
-        mock_torch = batch_invariant.torch

         # Create mock tensors
-        batch_size = 2
-        hidden_size = 4
         x = MagicMock(spec=torch.Tensor)
         residual = MagicMock(spec=torch.Tensor)
         weight = MagicMock(spec=torch.Tensor)
@@ -187,8 +185,6 @@ def test_add_rms_norm(self, mock_torch_npu):
     def test_add_rms_norm_consistency(self, mock_torch_npu):
         """Test that add_rms_norm produces the same output as torch_npu.npu_add_rms_norm"""
         # Create mock tensors
-        batch_size = 2
-        hidden_size = 4
         x = MagicMock(spec=torch.Tensor)
         residual = MagicMock(spec=torch.Tensor)
         weight = MagicMock(spec=torch.Tensor)
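
Both tests above construct their tensor stand-ins with MagicMock(spec=torch.Tensor). For readers unfamiliar with spec, here is a minimal, self-contained illustration of what it buys — standard unittest.mock behavior, not vllm-ascend code:

```python
# `spec` limits the mock's attribute surface to torch.Tensor's API,
# so a typo fails fast instead of silently returning another mock.
from unittest.mock import MagicMock

import torch

x = MagicMock(spec=torch.Tensor)
x.mul(2)  # fine: torch.Tensor defines .mul

try:
    x.mull(2)  # not part of the torch.Tensor spec
except AttributeError as exc:
    print(exc)  # Mock object has no attribute 'mull'
```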
495 changes: 267 additions & 228 deletions tests/ut/compilation/test_acl_graph.py

Large diffs are not rendered by default.

17 changes: 5 additions & 12 deletions tests/ut/compilation/test_npugraph_ex_utils_check.py
@@ -13,8 +13,7 @@
 # This file is a part of the vllm-ascend project.
 #

-from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import \
-    extra_stream_scope_check
+from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import extra_stream_scope_check


 def test_extra_stream_scope_check_logic():
@@ -24,31 +23,25 @@ def test_extra_stream_scope_check_logic():
     """

     class MockNode:
-
         def __init__(self, stream_label=None):
             self.op = "call_function"
             self.meta = {"stream_label": stream_label}

     class MockMatch:
-
         def __init__(self, nodes):
             self.nodes = nodes

     # Test 1: all default → OK
-    assert extra_stream_scope_check(
-        MockMatch([MockNode(None), MockNode(None)])) is True
+    assert extra_stream_scope_check(MockMatch([MockNode(None), MockNode(None)])) is True

     # Test 2: same non-default → OK
-    assert extra_stream_scope_check(
-        MockMatch([MockNode("s1"), MockNode("s1")])) is True
+    assert extra_stream_scope_check(MockMatch([MockNode("s1"), MockNode("s1")])) is True

     # Test 3: mixed non-default → FAIL
-    assert extra_stream_scope_check(
-        MockMatch([MockNode("s1"), MockNode("s2")])) is False
+    assert extra_stream_scope_check(MockMatch([MockNode("s1"), MockNode("s2")])) is False

     # Test 4: default + non-default → FAIL
-    assert extra_stream_scope_check(
-        MockMatch([MockNode(None), MockNode("s1")])) is False
+    assert extra_stream_scope_check(MockMatch([MockNode(None), MockNode("s1")])) is False

     # Test 5: empty → OK
     assert extra_stream_scope_check(MockMatch([])) is True
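
Taken together, the five cases pin down the invariant: a fused-pattern match must not straddle stream scopes, i.e. every node in the match carries the same stream_label (all None — the default stream — also counts as uniform, and an empty match passes trivially). A minimal sketch of that predicate, inferred from the test rather than taken from the actual vllm-ascend implementation:

```python
# Sketch of the checked invariant, assuming nodes expose an optional
# "stream_label" in node.meta exactly as MockNode does above.
def extra_stream_scope_check_sketch(match) -> bool:
    # Collect distinct labels; zero or one distinct label is consistent.
    labels = {node.meta.get("stream_label") for node in match.nodes}
    return len(labels) <= 1
```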
10 changes: 5 additions & 5 deletions tests/ut/conftest.py
@@ -20,17 +20,17 @@

 triton_runtime = MagicMock()
 triton_runtime.driver.active.utils.get_device_properties.return_value = {
-    'num_aic': 8,
-    'num_vectorcore': 8,
+    "num_aic": 8,
+    "num_vectorcore": 8,
 }
-sys.modules['triton.runtime'] = triton_runtime
+sys.modules["triton.runtime"] = triton_runtime

 from vllm_ascend.utils import adapt_patch  # noqa E402
 from vllm_ascend.utils import register_ascend_customop  # noqa E402

 # triton and torch_npu is not available in the environment, so we need to mock them
-sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
-sys.modules['torch_npu._inductor'] = MagicMock()
+sys.modules["torch_npu"].npu.current_device = MagicMock(return_value=0)
+sys.modules["torch_npu._inductor"] = MagicMock()

 adapt_patch()
 adapt_patch(True)
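
The conftest leans on a standard trick: installing MagicMock stand-ins in sys.modules before anything imports the real packages, so vllm-ascend modules that do `import torch_npu` can load on machines without Ascend hardware. A self-contained illustration of the mechanism:

```python
import sys
from unittest.mock import MagicMock

# Install the stub *before* the first import: Python caches modules in
# sys.modules, so any later `import torch_npu` resolves to this mock.
sys.modules["torch_npu"] = MagicMock()

import torch_npu  # returns the MagicMock installed above

torch_npu.npu.current_device = MagicMock(return_value=0)
assert torch_npu.npu.current_device() == 0
```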
60 changes: 19 additions & 41 deletions tests/ut/core/test_profiling_chunk.py
@@ -14,30 +14,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import Any, Dict, Optional
 from unittest.mock import MagicMock, patch

 import torch
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.sampling_params import SamplingParams
 from vllm.utils.hashing import sha256
-from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
-                                         init_none_hash)
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec)
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager

 from tests.ut.base import TestBase
-from vllm_ascend.ascend_config import (ProfilingChunkConfig,
-                                       clear_ascend_config, init_ascend_config)
-from vllm_ascend.core.profiling_chunk_predictor import (ChunkSizePredictor,
-                                                        ProfilingChunkManager)
-from vllm_ascend.core.scheduler_profiling_chunk import \
-    ProfilingChunkScheduler
-
+from vllm_ascend.ascend_config import ProfilingChunkConfig, clear_ascend_config, init_ascend_config
+from vllm_ascend.core.profiling_chunk_predictor import ChunkSizePredictor, ProfilingChunkManager
+from vllm_ascend.core.scheduler_profiling_chunk import ProfilingChunkScheduler

 MODEL = "Qwen/Qwen3-0.6B"
 BLOCK_SIZE = 16
@@ -63,10 +55,7 @@ def create_requests(num_requests, num_tokens=10, max_tokens=16):

 def make_output(scheduler):
     req_ids = [req.request_id for req in scheduler.running]
-    req_id_to_index = {
-        req.request_id: i
-        for i, req in enumerate(scheduler.running)
-    }
+    req_id_to_index = {req.request_id: i for i, req in enumerate(scheduler.running)}
     sampled_token_ids = [[1000]] * len(scheduler.running)
     return ModelRunnerOutput(
         req_ids=req_ids,
@@ -84,7 +73,6 @@ def make_output(scheduler):


 class TestProfilingChunkConfig(TestBase):
-
     def test_default_values(self):
         cfg = ProfilingChunkConfig()
         self.assertFalse(cfg.enabled)
@@ -144,10 +132,9 @@ def test_disabled_without_pp_ok(self, _mock):


 class TestChunkSizePredictor(TestBase):
-
     @staticmethod
     def _make_data(a, b, c, seq_lens):
-        return [a * l * l + b * l + c for l in seq_lens]
+        return [a * seq_len * seq_len + b * seq_len + c for seq_len in seq_lens]

     def test_fit_and_predict(self):
         predictor = ChunkSizePredictor()
@@ -158,8 +145,7 @@ def test_fit_and_predict(self):
         predictor.set_target_latency(8192)
         predictor.is_ready = True

-        chunk = predictor.predict(
-            num_computed_tokens=0, base_chunk_size=8192, page_size=128)
+        chunk = predictor.predict(num_computed_tokens=0, base_chunk_size=8192, page_size=128)
         self.assertIsNotNone(chunk)
         self.assertEqual(chunk % 128, 0)

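_make_data and the latency lists further down synthesize a quadratic cost model, latency ≈ a·l² + b·l + c in the sequence length l, and the test then expects a chunk size that is a whole number of 128-token pages. A hedged numpy sketch of that fit-then-invert idea — illustrative only, not the actual ChunkSizePredictor, and predict_chunk is a made-up helper name:

```python
import numpy as np

# Fit the quadratic latency model to the same synthetic data the tests use.
seq_lens = np.arange(64, 8256, 128, dtype=float)
latencies = 1e-6 * seq_lens**2 + 0.01 * seq_lens + 1.0
a, b, c = np.polyfit(seq_lens, latencies, deg=2)  # coefficients, highest degree first


def predict_chunk(target_latency: float, page_size: int = 128) -> int:
    # Invert the model: largest l with a*l^2 + b*l + c <= target_latency,
    # rounded down to a page multiple (matching the chunk % 128 == 0 assert).
    roots = np.roots([a, b, c - target_latency])
    l_max = max(r.real for r in roots if r.real > 0)
    return max(page_size, int(l_max) // page_size * page_size)


print(predict_chunk(target_latency=50.0))  # ~3584 for this synthetic model
```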
@@ -204,7 +190,6 @@ def test_fit_chunk_and_predict_with_history(self):


 class TestProfilingChunkManager(TestBase):
-
     def test_not_ready_before_profiling(self):
         mgr = ProfilingChunkManager(base_chunk_size=8192, page_size=128)
         self.assertFalse(mgr.is_ready)
@@ -213,7 +198,7 @@ def test_not_ready_before_profiling(self):
     def test_run_profiling_success(self):
         mgr = ProfilingChunkManager(base_chunk_size=8192, page_size=128)
         seq_lens = list(range(64, 8256, 128))
-        latencies = [1e-6 * l * l + 0.01 * l + 1.0 for l in seq_lens]
+        latencies = [1e-6 * seq_len * seq_len + 0.01 * seq_len + 1.0 for seq_len in seq_lens]
         self.assertTrue(mgr.predictor.fit(seq_lens, latencies))
         mgr.predictor.set_target_latency(8192)
         mgr.predictor.is_ready = True
@@ -233,15 +218,14 @@ def test_run_profiling_all_fail(self):
     def test_record_batch_refines_model(self):
         mgr = ProfilingChunkManager(base_chunk_size=8192, page_size=128)
         seq_lens = list(range(64, 8256, 128))
-        latencies = [1e-6 * l * l + 0.01 * l + 1.0 for l in seq_lens]
+        latencies = [1e-6 * seq_len * seq_len + 0.01 * seq_len + 1.0 for seq_len in seq_lens]
         mgr.predictor.fit(seq_lens, latencies)
         mgr.predictor.set_target_latency(8192)
         mgr.predictor.is_ready = True
         mgr._profiling_done = True

         for i in range(10):
-            mgr.record_batch_execution_time(
-                [(4096 - i * 100, i * 500)], 0.05 + i * 0.01)
+            mgr.record_batch_execution_time([(4096 - i * 100, i * 500)], 0.05 + i * 0.01)
         self.assertGreaterEqual(len(mgr.chunked_fit_data), 10)
         self.assertTrue(mgr.history_ready)

@@ -252,7 +236,6 @@ def test_record_batch_refines_model(self):


 class TestProfilingChunkScheduler(TestBase):
-
     @patch("vllm_ascend.ascend_config.AscendConfig.__init__", MagicMock(return_value=None))
     @patch("vllm_ascend.ascend_config.get_ascend_config")
     @patch("vllm.config.ModelConfig.__post_init__", MagicMock())
@@ -262,8 +245,7 @@ def create_scheduler(self, mock_get_ascend_config):
         profiling_cfg.enabled = True
         profiling_cfg.smooth_factor = 0.8
         profiling_cfg.min_chunk = 256
-        mock_get_ascend_config.return_value = MagicMock(
-            profiling_chunk_config=profiling_cfg)
+        mock_get_ascend_config.return_value = MagicMock(profiling_chunk_config=profiling_cfg)

         mock_hf_config = MagicMock()
         mock_hf_config.model_type = "qwen3"
@@ -295,7 +277,8 @@ def create_scheduler(self, mock_get_ascend_config):
         scheduler_config.chunked_prefill_enabled = True

         cache_config = CacheConfig(
-            block_size=BLOCK_SIZE, gpu_memory_utilization=0.9,
+            block_size=BLOCK_SIZE,
+            gpu_memory_utilization=0.9,
             cache_dtype="auto",
         )

@@ -306,6 +289,7 @@ def create_scheduler(self, mock_get_ascend_config):
         )
         vllm_config.parallel_config.pipeline_parallel_size = 2
         from unittest.mock import PropertyMock
+
         type(model_config).is_encoder_decoder = PropertyMock(return_value=False)
         vllm_config.model_config.hf_config.is_encoder_decoder = False

@@ -314,13 +298,8 @@ def create_scheduler(self, mock_get_ascend_config):
             kv_cache_tensors=[],
             kv_cache_groups=[
                 KVCacheGroupSpec(
-                    ['layer'],
-                    FullAttentionSpec(
-                        block_size=BLOCK_SIZE,
-                        num_kv_heads=1,
-                        head_size=1,
-                        dtype=torch.float32
-                    )
+                    ["layer"],
+                    FullAttentionSpec(block_size=BLOCK_SIZE, num_kv_heads=1, head_size=1, dtype=torch.float32),
                 )
             ],
         )
@@ -408,8 +387,7 @@ def test_schedule_chunked_prefill_running(self):
         mock_executor.collective_rpc.return_value = [10.0]
         scheduler.run_profiling_chunk_init(mock_executor)

-        requests = create_requests(num_requests=1, num_tokens=2000,
-                                   max_tokens=16)
+        requests = create_requests(num_requests=1, num_tokens=2000, max_tokens=16)
         for req in requests:
             scheduler.add_request(req)

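One detail in create_scheduler worth noting: the PropertyMock is assigned on type(model_config), not on the instance. Properties live on the class, so an instance-level assignment would never be consulted. A self-contained illustration of that standard unittest.mock pattern:

```python
from unittest.mock import MagicMock, PropertyMock

model_config = MagicMock()

# Attach the property to the *class* of the mock; attribute access on the
# instance then routes through the PropertyMock.
type(model_config).is_encoder_decoder = PropertyMock(return_value=False)

assert model_config.is_encoder_decoder is False
```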