From e776c995c9dfda675f94badd43fe5ed5816432e9 Mon Sep 17 00:00:00 2001 From: xiazhahe <1479989397@qq.com> Date: Fri, 27 Mar 2026 11:04:24 +0800 Subject: [PATCH 1/4] fix conflict between empty_cache and use_mem_pool --- python/sglang/srt/model_executor/model_runner.py | 2 ++ python/sglang/srt/model_loader/loader.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fc9afafac90b..fa6b94f58d59 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -1108,6 +1108,8 @@ def load_model(self): self.remote_instance_transfer_engine_weight_info = ( self.loader.remote_instance_transfer_engine_weight_info ) + if _is_npu: + torch.npu.empty_cache() monkey_patch_vllm_parallel_state(reverse=True) # Publish metadata to ModelExpress if running as seed source diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py index 27d189d65622..5998eb0234ab 100644 --- a/python/sglang/srt/model_loader/loader.py +++ b/python/sglang/srt/model_loader/loader.py @@ -707,8 +707,6 @@ def load_weights_and_postprocess(model, weights, target_device): # parameters onto device for processing and back off after. 
with device_loading_context(module, target_device): quant_method.process_weights_after_loading(module) - if _is_npu: - torch.npu.empty_cache() class LayeredModelLoader(DefaultModelLoader): From a9cc31d1f8308974c1033d33e2d398432af70c5b Mon Sep 17 00:00:00 2001 From: xiazhahe <1479989397@qq.com> Date: Sat, 28 Mar 2026 10:32:39 +0800 Subject: [PATCH 2/4] fix conflict between empty_cache and use_mem_pool --- python/sglang/srt/model_executor/model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fa6b94f58d59..9e1f21a6c5a4 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -1108,6 +1108,8 @@ def load_model(self): self.remote_instance_transfer_engine_weight_info = ( self.loader.remote_instance_transfer_engine_weight_info ) + # Cache needs to be cleared after loading model weights (in the self.loader.load_model function). + # To avoid conflict with memory_saver_adapter.region, empty_cache operation is now moved here. 
if _is_npu: torch.npu.empty_cache() monkey_patch_vllm_parallel_state(reverse=True) From 2a9ca791238d0f2dc30b713689262bd4d9701141 Mon Sep 17 00:00:00 2001 From: xiazhahe <1479989397@qq.com> Date: Sat, 28 Mar 2026 10:38:14 +0800 Subject: [PATCH 3/4] fix conflict between empty_cache and use_mem_pool --- python/sglang/srt/model_executor/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 9e1f21a6c5a4..f70fd087ea2e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -1108,7 +1108,7 @@ def load_model(self): self.remote_instance_transfer_engine_weight_info = ( self.loader.remote_instance_transfer_engine_weight_info ) - # Cache needs to be cleared after loading model weights (in the self.loader.load_model function). + # Cache needs to be cleared after loading model weights (in the self.loader.load_model function). # To avoid conflict with memory_saver_adapter.region, empty_cache operation is now moved here. if _is_npu: torch.npu.empty_cache() From b1aeb3d286dd3be01a70d3aab15b6d13e8eaa413 Mon Sep 17 00:00:00 2001 From: xiazhahe <1479989397@qq.com> Date: Sat, 28 Mar 2026 18:13:30 +0800 Subject: [PATCH 4/4] add ascend as unsupported backend for triton --- python/sglang/srt/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index 57d85eb2f136..076946ed666b 100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -363,7 +363,7 @@ def get_int_env_var(name: str, default: int = 0) -> int: def support_triton(backend: str) -> bool: - return backend not in ["torch_native", "intel_amx"] + return backend not in ["torch_native", "intel_amx", "ascend"] _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(