diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml index 14929952ebd6..272972077769 100644 --- a/.github/workflows/nightly-test-amd-rocm720.yml +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -621,7 +621,7 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + # 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2 nightly-8-gpu-qwen35-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35-rocm720,')) runs-on: linux-mi325-8gpu-sglang @@ -653,6 +653,18 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test ROCm 7.2 (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # 8-GPU GLM-5 (Accuracy) ROCm 7.2 nightly-8-gpu-glm5-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,')) @@ -1219,7 +1231,7 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + # MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2 nightly-8-gpu-mi35x-qwen35-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35-rocm720,')) runs-on: linux-mi35x-gpu-8 @@ -1252,6 +1264,18 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test MI35x ROCm 7.2 (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-mi35x-glm5-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,')) runs-on: linux-mi35x-gpu-8 diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 64cca74d7e0f..702ec1d94085 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -624,7 +624,7 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Qwen 3.5 (Accuracy) + # 8-GPU Qwen 3.5 (Accuracy + Performance combined) nightly-8-gpu-qwen35: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35,')) runs-on: linux-mi325-8gpu-sglang @@ -656,6 +656,18 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-glm5: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,')) runs-on: linux-mi325-8gpu-sglang @@ -1224,7 +1236,7 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # MI35x 8-GPU Qwen 3.5 (Accuracy) + # MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) nightly-8-gpu-mi35x-qwen35: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35,')) runs-on: linux-mi35x-gpu-8 @@ -1257,6 +1269,18 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test MI35x (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-mi35x-glm5: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,')) runs-on: linux-mi35x-gpu-8 diff --git a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py index dae0e31c10f7..112630ed474c 100644 --- a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py @@ -8,6 +8,10 @@ import os import unittest +from pathlib import Path + +import numpy as np +import yaml from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_amd_ci @@ -15,7 +19,9 @@ from sglang.test.test_utils import ( DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, + write_github_step_summary, ) register_amd_ci(est_time=3600, suite="nightly-amd-accuracy-8-gpu-qwen35", nightly=True) @@ -38,7 +44,7 @@ def setUpClass(cls): "--tp", str(TP_SIZE), "--attention-backend", - "triton", + "aiter", "--trust-remote-code", "--model-loader-extra-config", '{"enable_multithread_load": true}', @@ -59,6 +65,41 @@ def setUpClass(cls): def tearDownClass(cls): kill_process_tree(cls.process.pid) + def test_lm_eval(self): + """Override to write accuracy results to GitHub step summary.""" + import requests + + requests.get(self.base_url + "/flush_cache") + + eval_config = yaml.safe_load( + Path(self.model_config_name).read_text(encoding="utf-8") + ) + results = self.launch_lm_eval(eval_config) + rtol = eval_config.get("rtol", self.default_rtol) + model_name = eval_config.get("model_name", self.model) + + success = True + summary = f"### lm-eval accuracy ({model_name})\n" + summary += "| task | metric | expected | measured | 
status |\n" + summary += "| ---- | ------ | -------- | -------- | ------ |\n" + for task in eval_config["tasks"]: + for metric in task["metrics"]: + expected = metric["value"] + measured = results["results"][task["name"]][metric["name"]] + passed = bool(np.isclose(expected, measured, rtol=rtol)) + status = "✅" if passed else "❌" + summary += f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | {status} |\n" + print( + f"{task['name']} | {metric['name']}: " + f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}" + ) + success = success and passed + + if is_in_ci(): + write_github_step_summary(summary) + + self.assertTrue(success, "lm-eval validation failed") + if __name__ == "__main__": unittest.main() diff --git a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py index 2c6b8059bfa8..4b35a28d4405 100644 --- a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py @@ -8,8 +8,11 @@ import os import unittest +from pathlib import Path +import numpy as np import requests +import yaml from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_amd_ci @@ -17,7 +20,9 @@ from sglang.test.test_utils import ( DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, + write_github_step_summary, ) register_amd_ci( @@ -40,12 +45,12 @@ def setUpClass(cls): cls.base_url = DEFAULT_URL_FOR_TEST def test_lm_eval(self): - """Override to handle server lifecycle within test method (MI35x pattern).""" + """Override to handle server lifecycle and write results to summary.""" other_args = [ "--tp", str(TP_SIZE), "--attention-backend", - "triton", + "aiter", "--trust-remote-code", "--model-loader-extra-config", '{"enable_multithread_load": true}', @@ -65,7 +70,35 @@ def test_lm_eval(self): try: requests.get(self.base_url + "/flush_cache") - super().test_lm_eval() + 
+ + eval_config = yaml.safe_load( + Path(self.model_config_name).read_text(encoding="utf-8") + ) + results = self.launch_lm_eval(eval_config) + rtol = eval_config.get("rtol", self.default_rtol) + model_name = eval_config.get("model_name", self.model) + + success = True + summary = f"### lm-eval accuracy ({model_name})\n" + summary += "| task | metric | expected | measured | status |\n" + summary += "| ---- | ------ | -------- | -------- | ------ |\n" + for task in eval_config["tasks"]: + for metric in task["metrics"]: + expected = metric["value"] + measured = results["results"][task["name"]][metric["name"]] + passed = bool(np.isclose(expected, measured, rtol=rtol)) + status = "✅" if passed else "❌" + summary += f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | {status} |\n" + print( + f"{task['name']} | {metric['name']}: " + f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}" + ) + success = success and passed + + if is_in_ci(): + write_github_step_summary(summary) + + self.assertTrue(success, "lm-eval validation failed") finally: kill_process_tree(process.pid) diff --git a/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py new file mode 100644 index 000000000000..be5314a6438a --- /dev/null +++ b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py @@ -0,0 +1,139 @@ +"""Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + +Tests Qwen3.5-397B-A17B-FP8 (MoE, Hybrid Attention with Gated Delta Networks) +on 8 GPUs with aiter attention backend. 
+ +Model path can be configured via environment variable: +- QWEN35_FP8_MODEL_PATH: Path to Qwen3.5-FP8 model + (default: Qwen/Qwen3.5-397B-A17B-FP8) + +Example usage: + python -m pytest test_qwen35_fp8_perf_amd.py -v +""" + +import os +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +register_amd_ci(est_time=5400, suite="nightly-perf-8-gpu-qwen35-fp8", nightly=True) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). + """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI325") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +QWEN35_FP8_MODEL_PATH = os.environ.get( + "QWEN35_FP8_MODEL_PATH", "Qwen/Qwen3.5-397B-A17B-FP8" +) +PROFILE_DIR = "performance_profiles_qwen35_fp8" + + +class 
TestNightlyQwen35Fp8Performance(unittest.TestCase): + """Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + + Tests Qwen3.5 FP8 with aiter attention backend on TP=8. + Runtime: ~90 minutes + """ + + @classmethod + def setUpClass(cls): + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.model_config = { + "name": "qwen35-fp8", + "model_path": QWEN35_FP8_MODEL_PATH, + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--attention-backend", + "aiter", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--watchdog-timeout", + "1200", + ], + "env_vars": { + "SGLANG_USE_AITER": "1", + }, + } + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_qwen35_fp8(self): + """Run benchmark for Qwen3.5-397B-A17B FP8.""" + old_env = {} + for key, value in self.model_config.get("env_vars", {}).items(): + old_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model_config["model_path"], + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.model_config["other_args"], + variant=self.model_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + timeout=5400, + ) + results = result_tuple[0] + success = result_tuple[1] + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + + self.assertTrue(success, f"Benchmark failed for {QWEN35_FP8_MODEL_PATH}") + finally: + for key, value in old_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + 
self.runner.write_final_report() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py new file mode 100644 index 000000000000..6446eb601e84 --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py @@ -0,0 +1,139 @@ +"""MI35x Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + +Tests Qwen3.5-397B-A17B-FP8 (MoE, Hybrid Attention with Gated Delta Networks) +on 8 GPUs with aiter attention backend. + +Registry: nightly-perf-8-gpu-mi35x-qwen35-fp8 suite +""" + +import os + +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +register_amd_ci( + est_time=5400, suite="nightly-perf-8-gpu-mi35x-qwen35-fp8", nightly=True +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +QWEN35_FP8_MODEL_PATH = os.environ.get( + "QWEN35_FP8_MODEL_PATH", "Qwen/Qwen3.5-397B-A17B-FP8" +) +PROFILE_DIR = "performance_profiles_qwen35_fp8_mi35x" + + +class TestQwen35Fp8PerfMI35x(unittest.TestCase): + """Test suite for Qwen3.5-397B-A17B FP8 performance benchmarks on MI35x.""" + + @classmethod + def setUpClass(cls): + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.model_config = { + "name": "qwen35-fp8-mi35x", + "model_path": QWEN35_FP8_MODEL_PATH, + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--attention-backend", + "aiter", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--watchdog-timeout", + "1200", + ], + "env_vars": { + "SGLANG_USE_AITER": "1", + }, + } + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + 
cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_qwen35_fp8_perf(self): + """Run Qwen3.5-397B-A17B FP8 performance benchmark on MI35x.""" + old_env = {} + for key, value in self.model_config.get("env_vars", {}).items(): + old_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model_config["model_path"], + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.model_config["other_args"], + variant=self.model_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + timeout=5400, + ) + results = result_tuple[0] + success = result_tuple[1] + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + + self.assertTrue( + success, + f"Benchmark failed for {QWEN35_FP8_MODEL_PATH} on MI35x", + ) + finally: + for key, value in old_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + self.runner.write_final_report() + + +if __name__ == "__main__": + unittest.main()