sgl-project · iforgetmyname · Nov 25, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
@@ -112,9 +112,9 @@ def alloc_extend(
                 device=self.device,
             )
             torch.ops.npu.alloc_extend(
-                prefix_lens,
-                seq_lens,
-                last_loc,
+                prefix_lens.to(torch.int64),
+                seq_lens.to(torch.int64),
+                last_loc.to(torch.int64),
                 self.free_pages,
                 self.page_size,
                 out_indices,

@@ -329,6 +329,7 @@ def __init__(self, model_runner: ModelRunner):
             seq_len_fill_value=self.seq_len_fill_value,
             encoder_len_fill_value=self.encoder_len_fill_value,
             num_tokens_per_bs=self.num_tokens_per_bs,
+            cache_loc_dtype=self._cache_loc_dtype(),
         )
 
         self.tbo_plugin = TboCudaGraphRunnerPlugin()

@@ -43,13 +43,14 @@ def create(
         seq_len_fill_value: int,
         encoder_len_fill_value: int,
         num_tokens_per_bs: int,
+        cache_loc_dtype: torch.dtype,
     ) -> "GraphInputBuffers":
         with torch.device(device):
             input_ids = torch.zeros((max_num_token,), dtype=torch.int64)
             input_embeds = torch.zeros((max_num_token, hidden_size), dtype=dtype)
             req_pool_indices = torch.zeros((max_bs,), dtype=torch.int32)
             seq_lens = torch.full((max_bs,), seq_len_fill_value, dtype=torch.int32)
-            out_cache_loc = torch.zeros((max_num_token,), dtype=torch.int64)
+            out_cache_loc = torch.zeros((max_num_token,), dtype=cache_loc_dtype)
             positions = torch.zeros((max_num_token,), dtype=torch.int64)
             mrope_positions = torch.zeros((3, max_num_token), dtype=torch.int64)
             num_token_non_padded = torch.zeros((1,), dtype=torch.int32)

@@ -107,8 +107,8 @@ def replay(
             self.replay_prepare(forward_batch, pp_proxy_tensors)
         else:
             # In speculative decoding, these two fields are still needed.
-            self.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
-            self.positions[: self.raw_num_token].copy_(forward_batch.positions)
+            self.buffers.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.buffers.positions[: self.raw_num_token].copy_(forward_batch.positions)
 
         # Replay
         if not is_deepseek_nsa(self.model_runner.model_config.hf_config):

@@ -133,8 +133,8 @@ def assign_req_to_token_pool_func(
         torch.ops.npu.cache_loc_assign(
             req_pool_indices,
             req_to_token,
-            start_offset,
-            end_offset,
+            start_offset.to(torch.int64),
+            end_offset.to(torch.int64),
             out_cache_loc,
         )
 

diff --git a/test/srt/ascend/test_ascend_deepseek_mtp.py → ...manual/ascend/test_ascend_deepseek_mtp.py b/test/srt/ascend/test_ascend_deepseek_mtp.py → ...manual/ascend/test_ascend_deepseek_mtp.py
@@ -8,9 +8,7 @@
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
-    is_in_ci,
     popen_launch_server,
-    run_bench_offline_throughput,
 )
 
 TEST_MODEL_MATRIX = {
@@ -71,26 +69,6 @@ def test_a_gsm8k(self):
                 finally:
                     kill_process_tree(process.pid)
 
-    def test_b_throughput(self):
-        for model in self.models:
-            with self.subTest(model=model):
-                print(f"##=== Testing throughput: {model} ===##")
-
-                output_throughput = run_bench_offline_throughput(
-                    model,
-                    [
-                        *self.common_args,
-                    ],
-                )
-
-                print(f"##=== {model} throughput: {output_throughput} ===##")
-
-                if is_in_ci():
-                    self.assertGreater(
-                        output_throughput,
-                        TEST_MODEL_MATRIX[model]["output_throughput"],
-                    )
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -366,7 +366,7 @@
     ],
     "per-commit-16-npu-a3": [
         TestFile("ascend/test_ascend_deepep.py", 400),
-        TestFile("ascend/test_ascend_deepseek_mtp.py", 400),
+        # TestFile("ascend/test_ascend_deepseek_mtp.py", 400),  # disabled here: test file moved under manual/ascend/
     ],
 }