diff --git a/python/sglang/srt/hardware_backend/npu/utils.py b/python/sglang/srt/hardware_backend/npu/utils.py
index a0515f4f80e5..b5f42ab1d9a0 100644
--- a/python/sglang/srt/hardware_backend/npu/utils.py
+++ b/python/sglang/srt/hardware_backend/npu/utils.py
@@ -170,6 +170,10 @@ def npu_format_cast(
         )
         return tensor
 
+    # Skip format cast for meta tensors (used in offloader)
+    if tensor.device.type == "meta":
+        return tensor
+
     return torch.ops.npu.npu_format_cast(tensor, acl_format.value)
 
 
diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
index 1ade4ed9e4da..6fed17f84e05 100644
--- a/python/sglang/srt/layers/quantization/unquant.py
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -317,8 +317,16 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if _is_npu:
             for weight_name in ["w13_weight", "w2_weight"]:
                 weight = getattr(layer, weight_name)
-                weight.data = weight.data.transpose(1, 2)
-                weight.data = npu_format_cast(weight.data)
+                # Materialize the transposed layout in its own buffer and
+                # release the original storage before the format cast
+                # (presumably to keep peak device memory down).
+                origin_weight = weight.data.transpose(1, 2)
+                new_weight = origin_weight.contiguous()
+                # contiguous() hands back the input itself when no copy is
+                # needed; freeing the storage then would also empty new_weight.
+                if new_weight.data_ptr() != origin_weight.data_ptr():
+                    origin_weight.untyped_storage().resize_(0)
+                weight.data = npu_format_cast(new_weight)
 
         return
 
diff --git a/python/sglang/srt/utils/offloader.py b/python/sglang/srt/utils/offloader.py
index 58ab19c1f4e3..522d4e4d85a4 100644
--- a/python/sglang/srt/utils/offloader.py
+++ b/python/sglang/srt/utils/offloader.py
@@ -452,6 +452,12 @@ def _move_param_to_meta(module, param_name):
             data=new_data,
             requires_grad=False,
         )
+        # Keep the original weight_loader on the replacement parameter; when
+        # there is none, install a no-op so the attribute is always callable.
+        if hasattr(old_param, "weight_loader"):
+            new_param.weight_loader = old_param.weight_loader
+        else:
+            new_param.weight_loader = lambda *args, **kwargs: None
     else:
         raise ValueError(f"Unknown {old_param_type=} {old_param=}")
 
diff --git a/test/registered/ascend/basic_function/offloading/test_npu_offload_modes.py b/test/registered/ascend/basic_function/offloading/test_npu_offload_modes.py
new file mode 100644
index 000000000000..f69e801af7b5
--- /dev/null
+++ b/test/registered/ascend/basic_function/offloading/test_npu_offload_modes.py
@@ -0,0 +1,115 @@
+import unittest
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.ascend.test_ascend_utils import DEEPSEEK_CODER_V2_LITE_WEIGHTS_PATH
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+register_npu_ci(est_time=800, suite="nightly-2-npu-a3", nightly=True)
+
+TEST_MODEL_MATRIX = {
+    DEEPSEEK_CODER_V2_LITE_WEIGHTS_PATH,
+}
+
+# Upper bound (seconds) for a single /generate request, so a hung server
+# fails the test instead of blocking the run indefinitely.
+GENERATE_REQUEST_TIMEOUT = 300
+
+
+class TestAscendOffloadModes(CustomTestCase):
+    """Launch a server per offload mode and check one simple generation."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.9,
+            "--attention-backend",
+            "ascend",
+            "--offload-group-size",
+            4,
+            "--offload-num-in-group",
+            1,
+            "--offload-prefetch-step",
+            1,
+            "--dp-size",
+            2,
+        ]
+
+    def run_a_test(self, offload_mode, additional_args=None):
+        """Launch a server with the given offload mode and check one request.
+
+        Args:
+            offload_mode: Value passed to ``--offload-mode``.
+            additional_args: Optional extra server arguments to append.
+        """
+        for model in self.models:
+            with self.subTest(model=model, offload_mode=offload_mode):
+                print(f"##=== Testing {offload_mode} offload: {model} ===##")
+
+                args = [
+                    *self.common_args,
+                    "--offload-mode",
+                    offload_mode,
+                ]
+
+                if additional_args:
+                    args.extend(additional_args)
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=args,
+                )
+
+                try:
+                    # Check if server is running (basic functionality test)
+                    response = requests.post(
+                        f"{self.base_url}/generate",
+                        json={
+                            "text": "Where is the capital of France?",
+                            "sampling_params": {
+                                "temperature": 0,
+                                "max_new_tokens": 32,
+                            },
+                        },
+                        timeout=GENERATE_REQUEST_TIMEOUT,
+                    )
+                    self.assertEqual(
+                        response.status_code,
+                        200,
+                        f"The request status code is not 200, server failed to respond for {offload_mode}",
+                    )
+                    self.assertIn(
+                        "Paris",
+                        response.text,
+                        f"The inference result does not include Paris, server failed to respond for {offload_mode}",
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_offload_mode_cpu(self):
+        """Test offload mode: cpu"""
+        self.run_a_test("cpu")
+
+    def test_offload_mode_sharded_gpu(self):
+        """Test offload mode: sharded_gpu"""
+        self.run_a_test("sharded_gpu")
+
+
+if __name__ == "__main__":
+    unittest.main()