diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml
index 272972077769..ae763597bb89 100644
--- a/.github/workflows/nightly-test-amd-rocm720.yml
+++ b/.github/workflows/nightly-test-amd-rocm720.yml
@@ -665,7 +665,7 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
-  # 8-GPU GLM-5 (Accuracy) ROCm 7.2
+  # 8-GPU GLM-5 (Accuracy + Performance combined) ROCm 7.2
   nightly-8-gpu-glm5-rocm720:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,'))
     runs-on: linux-mi325-8gpu-sglang
@@ -697,6 +697,18 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+      - name: Performance Test ROCm 7.2 (8-GPU GLM-5)
+        timeout-minutes: 120
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-glm5 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+          echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
   # 8-GPU MiniMax-M2.5 (Accuracy + Performance combined) ROCm 7.2
   nightly-8-gpu-minimax-m25-rocm720:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-minimax-m25-rocm720,'))
@@ -1276,6 +1288,7 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+  # MI35x 8-GPU GLM-5 (Accuracy + Performance combined) ROCm 7.2
   nightly-8-gpu-mi35x-glm5-rocm720:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,'))
     runs-on: linux-mi35x-gpu-8
@@ -1309,6 +1322,17 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU GLM-5)
+        timeout-minutes: 120
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-glm5 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+          echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
   # MI35x 8-GPU GLM-4.7-FP8 (Accuracy) ROCm 7.2
   nightly-8-gpu-mi35x-glm47-fp8-rocm720:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm47-fp8-rocm720,'))
diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml
index 702ec1d94085..8495f51d5315 100644
--- a/.github/workflows/nightly-test-amd.yml
+++ b/.github/workflows/nightly-test-amd.yml
@@ -668,6 +668,7 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+  # 8-GPU GLM-5 (Accuracy + Performance combined)
   nightly-8-gpu-glm5:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,'))
     runs-on: linux-mi325-8gpu-sglang
@@ -699,6 +700,18 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+      - name: Performance Test (8-GPU GLM-5)
+        timeout-minutes: 120
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-glm5 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+          echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
   # 8-GPU MiniMax-M2.5 (Accuracy + Performance combined)
   nightly-8-gpu-minimax-m25:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-minimax-m25,'))
@@ -1281,6 +1294,7 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+  # MI35x 8-GPU GLM-5 (Accuracy + Performance combined)
   nightly-8-gpu-mi35x-glm5:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,'))
     runs-on: linux-mi35x-gpu-8
@@ -1314,6 +1328,17 @@ jobs:
           echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+      - name: Performance Test MI35x (8-GPU GLM-5)
+        timeout-minutes: 120
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-glm5 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+          echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
   # MI35x 8-GPU MiniMax-M2.5 (Accuracy + Performance combined)
   nightly-8-gpu-mi35x-minimax-m25:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-minimax-m25,'))
diff --git a/test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py
index ccfae7c192df..93233439f3ab 100644
--- a/test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py
+++ b/test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py
@@ -59,13 +59,17 @@ def get_display_name(self) -> str:
 GLM5_MODELS = [
     # GLM-5 with NSA attention (TP=8)
     ModelConfig(
-        model_path="zai-org/GLM-5",
+        model_path="zai-org/GLM-5-FP8",
         tp_size=8,
         accuracy_threshold=0.93,
         timeout=3600,
         variant="nsa",
         other_args=[
             "--trust-remote-code",
+            "--reasoning-parser",
+            "glm45",
+            "--tool-call-parser",
+            "glm47",
             "--nsa-prefill-backend",
             "tilelang",
             "--nsa-decode-backend",
@@ -77,7 +81,7 @@ def get_display_name(self) -> str:
             "--model-loader-extra-config",
             '{"enable_multithread_load": true}',
             "--watchdog-timeout",
-            "1200",  # 20 minutes for weight loading
+            "1200",
         ],
         env_vars={"SGLANG_USE_AITER": "1"},
     ),
diff --git a/test/registered/amd/accuracy/mi35x/test_glm5_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_glm5_eval_mi35x.py
index 96b38b692ab3..02af23a57c3c 100644
--- a/test/registered/amd/accuracy/mi35x/test_glm5_eval_mi35x.py
+++ b/test/registered/amd/accuracy/mi35x/test_glm5_eval_mi35x.py
@@ -64,13 +64,17 @@ def get_display_name(self) -> str:
 MI35X_GLM5_MODELS = [
     # GLM-5 with NSA attention (TP=8)
     ModelConfig(
-        model_path="zai-org/GLM-5",
+        model_path="zai-org/GLM-5-FP8",
         tp_size=8,
         accuracy_threshold=0.93,
         timeout=5400,
         variant="nsa",
         other_args=[
             "--trust-remote-code",
+            "--reasoning-parser",
+            "glm45",
+            "--tool-call-parser",
+            "glm47",
             "--nsa-prefill-backend",
             "tilelang",
             "--nsa-decode-backend",
@@ -82,7 +86,7 @@ def get_display_name(self) -> str:
             "--model-loader-extra-config",
             '{"enable_multithread_load": true}',
             "--watchdog-timeout",
-            "1200",  # 20 minutes for weight loading
+            "1200",
         ],
         env_vars={},
     ),
diff --git a/test/registered/amd/perf/mi30x/test_glm5_perf_amd.py b/test/registered/amd/perf/mi30x/test_glm5_perf_amd.py
new file mode 100644
index 000000000000..1cdd8f660a93
--- /dev/null
+++ b/test/registered/amd/perf/mi30x/test_glm5_perf_amd.py
@@ -0,0 +1,140 @@
+"""Nightly performance benchmark for GLM-5 on MI30x.
+
+Tests GLM-5 with NSA attention backend using bench_one_batch on 8 GPUs.
+
+Model paths can be configured via environment variables:
+- GLM5_MODEL_PATH: Path to GLM-5 model (default: zai-org/GLM-5-FP8)
+
+Example usage:
+    python -m pytest test_glm5_perf_amd.py -v
+"""
+
+import os
+import unittest
+from typing import List
+
+from sglang.test.ci.ci_register import register_amd_ci
+from sglang.test.nightly_bench_utils import BenchmarkResult
+from sglang.test.nightly_utils import NightlyBenchmarkRunner
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env
+
+register_amd_ci(est_time=5400, suite="nightly-perf-8-gpu-glm5", nightly=True)
+
+
+def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str:
+    model_header = results[0].model_path
+    if results[0].run_name and results[0].run_name != "default":
+        model_header += f" ({results[0].run_name})"
+
+    gpu_config = os.getenv("GPU_CONFIG", "MI325")
+    if gpu_config:
+        model_header += f" [{gpu_config}]"
+
+    summary = f"### {model_header}\n"
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n"
+
+    report_results = (
+        results[1:]
+        if len(results) > 1 and results[0].batch_size == results[1].batch_size
+        else results
+    )
+
+    for result in report_results:
+        itl = 1 / (result.output_throughput / result.batch_size) * 1000
+        summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n"
+
+    return summary
+
+
+GLM5_MODEL_PATH = os.environ.get("GLM5_MODEL_PATH", "zai-org/GLM-5-FP8")
+PROFILE_DIR = "performance_profiles_glm5"
+
+
+class TestNightlyGLM5Performance(unittest.TestCase):
+    """Nightly performance benchmark for GLM-5.
+
+    Tests GLM-5 with NSA attention backend on TP=8.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        cls.model_config = {
+            "name": "glm5",
+            "model_path": GLM5_MODEL_PATH,
+            "other_args": [
+                "--trust-remote-code",
+                "--reasoning-parser",
+                "glm45",
+                "--tool-call-parser",
+                "glm47",
+                "--tp",
+                "8",
+                "--nsa-prefill-backend",
+                "tilelang",
+                "--nsa-decode-backend",
+                "tilelang",
+                "--kv-cache-dtype",
+                "fp8_e4m3",
+                "--chunked-prefill-size",
+                "131072",
+                "--mem-fraction-static",
+                "0.85",
+                "--model-loader-extra-config",
+                '{"enable_multithread_load": true}',
+                "--watchdog-timeout",
+                "1200",
+            ],
+            "env_vars": {
+                "SGLANG_USE_AITER": "1",
+            },
+        }
+
+        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
+        cls.runner.setup_profile_directory()
+        cls.runner.full_report = f"## {cls.__name__}\n"
+
+    def test_bench_glm5(self):
+        """Run benchmark for GLM-5."""
+        old_env = {}
+        for key, value in self.model_config.get("env_vars", {}).items():
+            old_env[key] = os.environ.get(key)
+            os.environ[key] = value
+
+        try:
+            result_tuple = self.runner.run_benchmark_for_model(
+                model_path=self.model_config["model_path"],
+                batch_sizes=self.batch_sizes,
+                input_lens=self.input_lens,
+                output_lens=self.output_lens,
+                other_args=self.model_config["other_args"],
+                variant=self.model_config["name"],
+                extra_bench_args=["--trust-remote-code"],
+                enable_profile=False,
+                timeout=5400,
+            )
+            results = result_tuple[0]
+            success = result_tuple[1]
+
+            if results:
+                self.runner.full_report += (
+                    generate_simple_markdown_report(results) + "\n"
+                )
+
+            self.assertTrue(success, f"Benchmark failed for {GLM5_MODEL_PATH}")
+        finally:
+            for key, value in old_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
+            self.runner.write_final_report()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/amd/perf/mi35x/test_glm5_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_glm5_perf_mi35x.py
new file mode 100644
index 000000000000..a742cbc1d425
--- /dev/null
+++ b/test/registered/amd/perf/mi35x/test_glm5_perf_mi35x.py
@@ -0,0 +1,143 @@
+"""MI35x Nightly performance benchmark for GLM-5.
+
+Tests GLM-5 with NSA attention backend using bench_one_batch on 8 GPUs.
+
+Registry: nightly-perf-8-gpu-mi35x-glm5 suite
+"""
+
+import os
+
+os.environ.setdefault("HF_HOME", "/data2/models/huggingface")
+os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub")
+
+import unittest
+from typing import List
+
+from sglang.test.ci.ci_register import register_amd_ci
+from sglang.test.nightly_bench_utils import BenchmarkResult
+from sglang.test.nightly_utils import NightlyBenchmarkRunner
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env
+
+register_amd_ci(est_time=5400, suite="nightly-perf-8-gpu-mi35x-glm5", nightly=True)
+
+
+def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str:
+    model_header = results[0].model_path
+    if results[0].run_name and results[0].run_name != "default":
+        model_header += f" ({results[0].run_name})"
+
+    gpu_config = os.getenv("GPU_CONFIG", "MI35x")
+    if gpu_config:
+        model_header += f" [{gpu_config}]"
+
+    summary = f"### {model_header}\n"
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n"
+
+    report_results = (
+        results[1:]
+        if len(results) > 1 and results[0].batch_size == results[1].batch_size
+        else results
+    )
+
+    for result in report_results:
+        itl = 1 / (result.output_throughput / result.batch_size) * 1000
+        summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n"
+
+    return summary
+
+
+GLM5_MODEL_PATH = os.environ.get("GLM5_MODEL_PATH", "zai-org/GLM-5-FP8")
+PROFILE_DIR = "performance_profiles_glm5_mi35x"
+
+
+class TestGLM5PerfMI35x(unittest.TestCase):
+    """Nightly performance benchmark for GLM-5 on MI35x.
+
+    Tests GLM-5 with NSA attention backend on TP=8.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        cls.model_config = {
+            "name": "glm5-mi35x",
+            "model_path": GLM5_MODEL_PATH,
+            "other_args": [
+                "--trust-remote-code",
+                "--reasoning-parser",
+                "glm45",
+                "--tool-call-parser",
+                "glm47",
+                "--tp",
+                "8",
+                "--nsa-prefill-backend",
+                "tilelang",
+                "--nsa-decode-backend",
+                "tilelang",
+                "--kv-cache-dtype",
+                "fp8_e4m3",
+                "--chunked-prefill-size",
+                "131072",
+                "--mem-fraction-static",
+                "0.85",
+                "--model-loader-extra-config",
+                '{"enable_multithread_load": true, "num_threads": 8}',
+                "--watchdog-timeout",
+                "1200",
+            ],
+            "env_vars": {
+                "SGLANG_ROCM_FUSED_DECODE_MLA": "0",
+                "ROCM_QUICK_REDUCE_QUANTIZATION": "INT4",
+                "SAFETENSORS_FAST_GPU": "1",
+            },
+        }
+
+        os.environ.setdefault("SGLANG_BENCH_TIMEOUT", "3600")
+        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
+        cls.runner.setup_profile_directory()
+        cls.runner.full_report = f"## {cls.__name__}\n"
+
+    def test_glm5_perf(self):
+        """Run GLM-5 performance benchmark on MI35x."""
+        old_env = {}
+        for key, value in self.model_config.get("env_vars", {}).items():
+            old_env[key] = os.environ.get(key)
+            os.environ[key] = value
+
+        try:
+            result_tuple = self.runner.run_benchmark_for_model(
+                model_path=self.model_config["model_path"],
+                batch_sizes=self.batch_sizes,
+                input_lens=self.input_lens,
+                output_lens=self.output_lens,
+                other_args=self.model_config["other_args"],
+                variant=self.model_config["name"],
+                extra_bench_args=["--trust-remote-code"],
+                enable_profile=False,
+                timeout=5400,
+            )
+            results = result_tuple[0]
+            success = result_tuple[1]
+
+            if results:
+                self.runner.full_report += (
+                    generate_simple_markdown_report(results) + "\n"
+                )
+
+            self.assertTrue(success, f"Benchmark failed for {GLM5_MODEL_PATH} on MI35x")
+        finally:
+            for key, value in old_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
+            self.runner.write_final_report()
+
+
+if __name__ == "__main__":
+    unittest.main()