diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 57f04db6ce79..b951a139a5f0 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -60,7 +60,7 @@ jobs: - "python/sglang/!(multimodal_gen)/**/!(*.md)" - "python/pyproject_npu.toml" - "scripts/ci/npu/npu_ci_install_dependency.sh" - - "test/srt/ascend/**" + - "test/registered/ascend/**" - ".github/workflows/pr-test-npu.yml" multimodal_gen: - "python/sglang/multimodal_gen/**/!(*.md|*.ipynb)" diff --git a/test/srt/ascend/test_embed_interpolate_unittest.py b/test/srt/ascend/test_embed_interpolate_unittest.py deleted file mode 100644 index d18f71091241..000000000000 --- a/test/srt/ascend/test_embed_interpolate_unittest.py +++ /dev/null @@ -1,104 +0,0 @@ -import unittest - -import torch - -from sglang.srt.configs.qwen3_vl import Qwen3VLConfig -from sglang.srt.distributed.parallel_state import ( - init_distributed_environment, - initialize_model_parallel, -) -from sglang.srt.layers.dp_attention import initialize_dp_attention -from sglang.srt.layers.quantization.unquant import ( - LinearMethodBase, - UnquantizedLinearMethod, -) -from sglang.srt.models.qwen3_vl import Qwen3VLMoeVisionModel -from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler - - -def unpack(tensor, dim_len, pack_len): - dim_part = dim_len // pack_len - ret_val = tensor.reshape(dim_part, dim_part, pack_len, pack_len, -1) - ret_val = ret_val.permute(4, 0, 2, 1, 3).reshape(1, -1, dim_len, dim_len) - return ret_val - - -class TestEmbedInterpolate(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.pDevice = torch.get_default_device() - torch.set_default_device("npu") - - @classmethod - def tearDownClass(cls): - torch.set_default_device(cls.pDevice) - - def test_embed_interpolate(self): - self.assertTrue(issubclass(UnquantizedLinearMethod, LinearMethodBase)) - t_dim = [16, 32] - s_dim = [192, 574] - sarg = ServerArgs(model_path="dummy", device="npu") - mconf = Qwen3VLConfig( - hidden_size=64, - num_heads=1, - num_position_embeddings=2304, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - deepstack_visual_indexes=[5, 11, 17], - in_channels=3, - depth=24, - intermediate_size=256, - hidden_act="gelu_pytorch_tanh", - out_hidden_size=2560, - ) - set_global_server_args_for_scheduler(sarg) - init_distributed_environment( - backend="gloo", - world_size=1, - rank=0, - local_rank=0, - distributed_init_method="tcp://127.0.0.1:2646", - ) - initialize_model_parallel() - initialize_dp_attention( - server_args=sarg, - model_config=mconf, - ) - model = Qwen3VLMoeVisionModel( - mconf, - quant_config=None, - norm_eps=1e-6, - prefix="visual", - ) - grid_thw = torch.tensor( - [(t, s, s) for t, s in zip(t_dim, s_dim)], dtype=torch.int32 - ) - embeddings = model.fast_pos_embed_interpolate(grid_thw) - - embeddings_s0 = embeddings[: s_dim[0] * s_dim[0], :] - embeddings_s1 = embeddings[s_dim[0] * s_dim[0] : 2 * s_dim[0] * s_dim[0], :] - self.assertTrue(torch.allclose(embeddings_s0, embeddings_s1, atol=5e-5)) - - embeddings_l = embeddings[ - t_dim[0] * s_dim[0] * s_dim[0] : t_dim[0] * s_dim[0] * s_dim[0] - + s_dim[1] * s_dim[1], - :, - ] - embeddings_s0 = torch.nn.functional.interpolate( - unpack(embeddings_s0, s_dim[0], 2), - size=(48, 48), - mode="area", - ) - embeddings_r = torch.nn.functional.interpolate( - unpack(embeddings_l, s_dim[1], 2), - size=(48, 48), - mode="area", - ) - self.assertTrue( - torch.allclose(embeddings_s0, embeddings_r, atol=5e-1, rtol=5e-1) - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/configs/deepseek_v3.yaml b/test/srt/configs/deepseek_v3.yaml deleted file mode 100644 index 82d059cba881..000000000000 --- a/test/srt/configs/deepseek_v3.yaml +++ /dev/null @@ -1,28 +0,0 @@ -tasks: - - name: sglang-8192-1024-concurrency1 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 1 --num-prompts 5 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency2 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 2 --num-prompts 10 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency4 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 4 --num-prompts 20 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency8 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 8 --num-prompts 32 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency16 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 16 --num-prompts 48 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency24 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 24 --num-prompts 72 --output-file deepseek_v3_results.jsonl - - - name: sglang-8192-1024-concurrency32 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 32 --num-prompts 96 --output-file deepseek_v3_results.jsonl diff --git a/test/srt/configs/deepseek_v3_long_context.yaml b/test/srt/configs/deepseek_v3_long_context.yaml deleted file mode 100644 index df416a4299a1..000000000000 --- a/test/srt/configs/deepseek_v3_long_context.yaml +++ /dev/null @@ -1,28 +0,0 @@ -tasks: - - name: sglang-32000-100-concurrency1 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 1 --num-prompts 5 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency2 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 2 --num-prompts 10 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency4 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 4 --num-prompts 20 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency8 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 8 --num-prompts 32 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency16 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 16 --num-prompts 48 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency24 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 24 --num-prompts 72 --output-file deepseek_v3_long_context_results.jsonl - - - name: sglang-32000-100-concurrency32 - server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 32 --num-prompts 96 --output-file deepseek_v3_long_context_results.jsonl diff --git a/test/srt/configs/llama_405b.yaml b/test/srt/configs/llama_405b.yaml deleted file mode 100644 index db0c816fb577..000000000000 --- a/test/srt/configs/llama_405b.yaml +++ /dev/null @@ -1,28 +0,0 @@ -tasks: - - name: sglang-8192-1024-concurrency1 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 1 --num-prompts 5 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency2 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 2 --num-prompts 10 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency4 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 4 --num-prompts 20 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency8 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 8 --num-prompts 32 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency16 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 16 --num-prompts 48 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency24 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 24 --num-prompts 72 --output-file llama_405b_results.jsonl - - - name: sglang-8192-1024-concurrency32 - server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8 - client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 32 --num-prompts 96 --output-file llama_405b_results.jsonl diff --git a/test/srt/configs/random_config.yaml b/test/srt/configs/random_config.yaml deleted file mode 100644 index eae8c27f41c0..000000000000 --- a/test/srt/configs/random_config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -tasks: - - name: sglang-128-4 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440 - - name: vllm-128-4 - server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests - client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440 - - name: sglang-2000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120 - - name: vllm-2000-100 - server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests - client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120 - - name: sglang-4000-200 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480 - - name: vllm-4000-200 - server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests - client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480 - - name: sglang-32000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60 - - name: vllm-32000-100 - server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests - client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60 diff --git a/test/srt/configs/random_flashinfer_vs_triton_config.yaml b/test/srt/configs/random_flashinfer_vs_triton_config.yaml deleted file mode 100644 index 7f4a386ddcfe..000000000000 --- a/test/srt/configs/random_flashinfer_vs_triton_config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -tasks: - - name: sglang-128-4 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440 - - name: sglang-triton-128-4 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440 - - name: sglang-2000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120 - - name: sglang-triton-2000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120 - - name: sglang-4000-200 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480 - - name: sglang-triton-4000-200 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480 - - name: sglang-32000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60 - - name: sglang-triton-32000-100 - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60 diff --git a/test/srt/configs/sharegpt_config.yaml b/test/srt/configs/sharegpt_config.yaml deleted file mode 100644 index a80b96c8eaec..000000000000 --- a/test/srt/configs/sharegpt_config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -tasks: - - name: sglang-benchmark - server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache - client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --request-rate 16 - - name: vllm-benchmark - server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests - client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --request-rate 16 diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 9fff2f7db1a5..d053a895a07d 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -13,9 +13,7 @@ suites = { # quantization_test suite migrated to test/registered/quant/ # All CUDA tests migrated to test/registered/ - "__not_in_ci__": [ - TestFile("ascend/test_embed_interpolate_unittest.py"), - ], + "__not_in_ci__": [], } # Add AMD tests