Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/sglang/srt/hardware_backend/npu/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ def npu_format_cast(
)
return tensor

# Skip format cast for meta tensors (used in offloader)
if tensor.device.type == "meta":
return tensor

return torch.ops.npu.npu_format_cast(tensor, acl_format.value)


Expand Down
6 changes: 4 additions & 2 deletions python/sglang/srt/layers/quantization/unquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if _is_npu:
for weight_name in ["w13_weight", "w2_weight"]:
weight = getattr(layer, weight_name)
weight.data = weight.data.transpose(1, 2)
weight.data = npu_format_cast(weight.data)
origin_weight = weight.data.transpose(1, 2)
new_weight = origin_weight.contiguous()
origin_weight.untyped_storage().resize_(0)
weight.data = npu_format_cast(new_weight)

return

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/utils/offloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,10 @@ def _move_param_to_meta(module, param_name):
data=new_data,
requires_grad=False,
)
# Carry the original parameter's loader over to its meta-device replacement.
# The attribute name must be spelled "weight_loader": with a misspelled name
# the lookup never matches and every real loader is silently replaced by the
# no-op fallback below.
new_param.weight_loader = getattr(
    old_param, "weight_loader", lambda *args, **kwargs: None
)
else:
raise ValueError(f"Unknown {old_param_type=} {old_param=}")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import unittest
from urllib.parse import urlparse

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ascend.test_ascend_utils import DEEPSEEK_CODER_V2_LITE_WEIGHTS_PATH
from sglang.test.ci.ci_register import register_npu_ci
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)

# Register this test file with the NPU CI (nightly suite "nightly-2-npu-a3").
# NOTE(review): est_time is presumably the expected runtime in seconds — confirm
# against the ci_register helper.
register_npu_ci(est_time=800, suite="nightly-2-npu-a3", nightly=True)

# Set of model weight paths that every offload mode is exercised against.
TEST_MODEL_MATRIX = {
DEEPSEEK_CODER_V2_LITE_WEIGHTS_PATH,
}


class TestAscendOffloadModes(CustomTestCase):
    """Smoke-test server launch and generation for each offload mode.

    For every model in ``TEST_MODEL_MATRIX`` a server is launched with the
    shared offload arguments plus one ``--offload-mode`` value, a single
    greedy ``/generate`` request is sent, and the response is checked for an
    HTTP 200 status and the word "Paris".
    """

    # Upper bound, in seconds, for the single /generate request. Without a
    # timeout a wedged server would block the test (and the CI job) forever
    # instead of failing and reaching the cleanup in ``finally``.
    request_timeout = 600

    @classmethod
    def setUpClass(cls):
        """Prepare the model list, base URL and launch arguments shared by all tests."""
        cls.models = TEST_MODEL_MATRIX
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
        cls.common_args = [
            "--trust-remote-code",
            "--disable-cuda-graph",
            "--mem-fraction-static",
            0.9,
            "--attention-backend",
            "ascend",
            "--offload-group-size",
            4,
            "--offload-num-in-group",
            1,
            "--offload-prefetch-step",
            1,
            "--dp-size",
            2,
        ]

    def run_a_test(self, offload_mode, additional_args=None):
        """Launch a server with ``offload_mode`` and verify one generation.

        Args:
            offload_mode: Value passed to ``--offload-mode``.
            additional_args: Optional extra command-line arguments appended
                after the common ones.
        """
        for model in self.models:
            with self.subTest(model=model, offload_mode=offload_mode):
                print(f"##=== Testing {offload_mode} offload: {model} ===##")

                args = [
                    *self.common_args,
                    "--offload-mode",
                    offload_mode,
                ]

                if additional_args:
                    args.extend(additional_args)

                process = popen_launch_server(
                    model,
                    self.base_url,
                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    other_args=args,
                )

                try:
                    # Basic functionality check: query the same URL the
                    # server was launched on, with a bounded wait.
                    response = requests.post(
                        f"{self.base_url}/generate",
                        json={
                            "text": "Where is the capital of France?",
                            "sampling_params": {
                                "temperature": 0,
                                "max_new_tokens": 32,
                            },
                        },
                        timeout=self.request_timeout,
                    )
                    self.assertEqual(
                        response.status_code,
                        200,
                        f"The request status code is not 200, server failed to respond for {offload_mode}",
                    )
                    self.assertIn(
                        "Paris",
                        response.text,
                        f"The inference result does not include Paris, server failed to respond for {offload_mode}",
                    )
                finally:
                    # Always tear the server down, even when an assertion
                    # fails or the request times out.
                    kill_process_tree(process.pid)

    def test_offload_mode_cpu(self):
        """Test offload mode: cpu"""
        self.run_a_test("cpu")

    def test_offload_mode_sharded_gpu(self):
        """Test offload mode: sharded_gpu"""
        self.run_a_test("sharded_gpu")


# Allow running this test file directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading