From e83c7dcdaae4ba8e5667d48c94458c0f31d14fd3 Mon Sep 17 00:00:00 2001
From: Icey <1790571317@qq.com>
Date: Tue, 28 Oct 2025 08:39:23 +0000
Subject: [PATCH 1/2] [WIP] Fix problems introduced by vllm #26016

Signed-off-by: Icey <1790571317@qq.com>
---
 vllm_ascend/platform.py                   | 61 ++++++++---------------
 vllm_ascend/spec_decode/eagle_proposer.py |  4 +-
 2 files changed, 21 insertions(+), 44 deletions(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index d7550bf11b7..152139dc904 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -143,47 +143,26 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "Non-MLA LLMs forcibly disable the chunked prefill feature,"
                 "as the performance of operators supporting this feature "
                 "functionality is currently suboptimal.")
-            if vllm_version_is("0.11.0"):
-                if not model_config.is_multimodal_model and \
-                    structured_outputs_config.backend == "auto" and \
-                    not scheduler_config.send_delta_data and \
-                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                    scheduler_config.policy == "fcfs":
-                    ascend_scheduler_config.enabled = True
-                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                        ascend_scheduler_config, "enable_chunked_prefill",
-                        False)
-                    if chunked_prefill_enabled_in_ascend_scheduler:
-                        logger.warning(
-                            "Chunked prefill feature is enabled in ascend_scheduler,"
-                            "but note that the operator supporting this feature "
-                            "would lead to performance degradation.")
-                    # In this situation, max_num_batched_tokens would have been rewritten.
-                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-                    if (scheduler_config.max_num_batched_tokens
-                            < scheduler_config.max_model_len and
-                            not chunked_prefill_enabled_in_ascend_scheduler):
-                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
-            else:
-                if not model_config.is_multimodal_model and \
-                    structured_outputs_config.backend == "auto" and \
-                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                    scheduler_config.policy == "fcfs":
-                    ascend_scheduler_config.enabled = True
-                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                        ascend_scheduler_config, "enable_chunked_prefill",
-                        False)
-                    if chunked_prefill_enabled_in_ascend_scheduler:
-                        logger.warning(
-                            "Chunked prefill feature is enabled in ascend_scheduler,"
-                            "but note that the operator supporting this feature "
-                            "would lead to performance degradation.")
-                    # In this situation, max_num_batched_tokens would have been rewritten.
-                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-                    if (scheduler_config.max_num_batched_tokens
-                            < scheduler_config.max_model_len and
-                            not chunked_prefill_enabled_in_ascend_scheduler):
-                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not getattr(scheduler_config, "send_delta_data", False) and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler,"
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
 
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 74e2917806b..447484dea3f 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -63,9 +63,7 @@ def __init__(self,
                 == CompilationMode.VLLM_COMPILE
                 and not self.vllm_config.model_config.enforce_eager)
 
-        self.cudagraph_batch_sizes = list(
-            reversed(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes))
+            self.cudagraph_batch_sizes = sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes)
 
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(

From 820aad0bc5d15f9573bebf4772179bcb9b44a865 Mon Sep 17 00:00:00 2001
From: Icey <1790571317@qq.com>
Date: Tue, 28 Oct 2025 08:47:03 +0000
Subject: [PATCH 2/2] Reformat line wrapping in platform.py and eagle_proposer.py

Signed-off-by: Icey <1790571317@qq.com>
---
 vllm_ascend/platform.py                   | 7 +++----
 vllm_ascend/spec_decode/eagle_proposer.py | 3 ++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 152139dc904..401e0b27af0 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -150,8 +150,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 scheduler_config.policy == "fcfs":
                 ascend_scheduler_config.enabled = True
                 chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                    ascend_scheduler_config, "enable_chunked_prefill",
-                    False)
+                    ascend_scheduler_config, "enable_chunked_prefill", False)
                 if chunked_prefill_enabled_in_ascend_scheduler:
                     logger.warning(
                         "Chunked prefill feature is enabled in ascend_scheduler,"
@@ -160,8 +159,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 # In this situation, max_num_batched_tokens would have been rewritten.
                 # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
                 if (scheduler_config.max_num_batched_tokens
-                        < scheduler_config.max_model_len and
-                        not chunked_prefill_enabled_in_ascend_scheduler):
+                        < scheduler_config.max_model_len
+                        and not chunked_prefill_enabled_in_ascend_scheduler):
                     scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
 
         kv_cache_dtype = vllm_config.additional_config.get(
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 447484dea3f..c1be6ed65b7 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -63,7 +63,8 @@ def __init__(self,
                 == CompilationMode.VLLM_COMPILE
                 and not self.vllm_config.model_config.enforce_eager)
 
-            self.cudagraph_batch_sizes = sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes)
+            self.cudagraph_batch_sizes = sorted(
+                self.vllm_config.compilation_config.cudagraph_capture_sizes)
 
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(