From cc460e9ad6cd133877937d04d671ffc2929869ca Mon Sep 17 00:00:00 2001 From: Michael <13900043+michaelzhang-ai@users.noreply.github.com> Date: Mon, 30 Mar 2026 02:02:08 -0500 Subject: [PATCH 1/4] [AMD CI] Add Qwen3.5-397B FP8 nightly perf benchmarks for MI30x and MI35x Add bench_one_batch performance tests for Qwen3.5-397B-A17B-FP8 on both MI325/MI300X and MI35x GPUs. Perf steps run after existing accuracy tests with continue-on-error so perf failures don't block CI when accuracy passes. - New test files using triton attention backend, TP=8, mem-fraction 0.8 - Perf steps added to both default ROCm and ROCm 7.2 nightly workflows - Suite names: nightly-perf-8-gpu-qwen35-fp8, nightly-perf-8-gpu-mi35x-qwen35-fp8 --- .../workflows/nightly-test-amd-rocm720.yml | 28 +++- .github/workflows/nightly-test-amd.yml | 28 +++- .../perf/mi30x/test_qwen35_fp8_perf_amd.py | 139 ++++++++++++++++++ .../perf/mi35x/test_qwen35_fp8_perf_mi35x.py | 139 ++++++++++++++++++ 4 files changed, 330 insertions(+), 4 deletions(-) create mode 100644 test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py create mode 100644 test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml index 2c0d3a120e5e..e00ddd188783 100644 --- a/.github/workflows/nightly-test-amd-rocm720.yml +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -621,7 +621,7 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + # 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2 nightly-8-gpu-qwen35-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35-rocm720,')) runs-on: linux-mi325-8gpu-sglang @@ -653,6 +653,18 @@ jobs: 
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test ROCm 7.2 (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # 8-GPU GLM-5 (Accuracy) ROCm 7.2 nightly-8-gpu-glm5-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,')) @@ -1207,7 +1219,7 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + # MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2 nightly-8-gpu-mi35x-qwen35-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35-rocm720,')) runs-on: linux-mi35x-gpu-8 @@ -1240,6 +1252,18 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test MI35x ROCm 7.2 (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd 
--suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-mi35x-glm5-rocm720: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,')) runs-on: linux-mi35x-gpu-8 diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 5443df894df5..1d1499817cd6 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -624,7 +624,7 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Qwen 3.5 (Accuracy) + # 8-GPU Qwen 3.5 (Accuracy + Performance combined) nightly-8-gpu-qwen35: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35,')) runs-on: linux-mi325-8gpu-sglang @@ -656,6 +656,18 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-glm5: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,')) runs-on: linux-mi325-8gpu-sglang @@ -1212,7 +1224,7 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # MI35x 8-GPU Qwen 3.5 (Accuracy) + # MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) nightly-8-gpu-mi35x-qwen35: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35,')) runs-on: linux-mi35x-gpu-8 @@ -1245,6 +1257,18 @@ jobs: echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + - name: Performance Test MI35x (8-GPU Qwen 3.5 FP8) + timeout-minutes: 120 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + nightly-8-gpu-mi35x-glm5: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,')) runs-on: linux-mi35x-gpu-8 diff --git a/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py new file mode 100644 index 000000000000..824ed636eb02 --- /dev/null +++ b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py @@ -0,0 +1,139 @@ +"""Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + +Tests Qwen3.5-397B-A17B-FP8 (MoE, Hybrid Attention with Gated Delta Networks) +on 8 GPUs with triton attention backend. + +Model path can be configured via environment variable: +- QWEN35_FP8_MODEL_PATH: Path to Qwen3.5-FP8 model + (default: Qwen/Qwen3.5-397B-A17B-FP8) + +Example usage: + python -m pytest test_qwen35_fp8_perf_amd.py -v +""" + +import os +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +register_amd_ci(est_time=5400, suite="nightly-perf-8-gpu-qwen35-fp8", nightly=True) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI325") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +QWEN35_FP8_MODEL_PATH = os.environ.get( + "QWEN35_FP8_MODEL_PATH", "Qwen/Qwen3.5-397B-A17B-FP8" +) +PROFILE_DIR = "performance_profiles_qwen35_fp8" + + +class TestNightlyQwen35Fp8Performance(unittest.TestCase): + """Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + + Tests Qwen3.5 FP8 with triton attention backend on TP=8. 
+ Runtime: ~90 minutes + """ + + @classmethod + def setUpClass(cls): + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.model_config = { + "name": "qwen35-fp8", + "model_path": QWEN35_FP8_MODEL_PATH, + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--attention-backend", + "triton", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--watchdog-timeout", + "1200", + ], + "env_vars": { + "SGLANG_USE_AITER": "1", + }, + } + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_qwen35_fp8(self): + """Run benchmark for Qwen3.5-397B-A17B FP8.""" + old_env = {} + for key, value in self.model_config.get("env_vars", {}).items(): + old_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model_config["model_path"], + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.model_config["other_args"], + variant=self.model_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + timeout=5400, + ) + results = result_tuple[0] + success = result_tuple[1] + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + + self.assertTrue(success, f"Benchmark failed for {QWEN35_FP8_MODEL_PATH}") + finally: + for key, value in old_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + self.runner.write_final_report() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py 
b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py new file mode 100644 index 000000000000..97b3be9c34ba --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py @@ -0,0 +1,139 @@ +"""MI35x Nightly performance benchmark for Qwen3.5-397B-A17B FP8. + +Tests Qwen3.5-397B-A17B-FP8 (MoE, Hybrid Attention with Gated Delta Networks) +on 8 GPUs with triton attention backend. + +Registry: nightly-perf-8-gpu-mi35x-qwen35-fp8 suite +""" + +import os + +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +register_amd_ci( + est_time=5400, suite="nightly-perf-8-gpu-mi35x-qwen35-fp8", nightly=True +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +QWEN35_FP8_MODEL_PATH = os.environ.get( + "QWEN35_FP8_MODEL_PATH", "Qwen/Qwen3.5-397B-A17B-FP8" +) +PROFILE_DIR = "performance_profiles_qwen35_fp8_mi35x" + + +class TestQwen35Fp8PerfMI35x(unittest.TestCase): + """Test suite for Qwen3.5-397B-A17B FP8 performance benchmarks on MI35x.""" + + @classmethod + def setUpClass(cls): + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.model_config = { + "name": "qwen35-fp8-mi35x", + "model_path": QWEN35_FP8_MODEL_PATH, + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--attention-backend", + "triton", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--watchdog-timeout", + "1200", + ], + "env_vars": { + "SGLANG_USE_AITER": "1", + }, + } + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + 
cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_qwen35_fp8_perf(self): + """Run Qwen3.5-397B-A17B FP8 performance benchmark on MI35x.""" + old_env = {} + for key, value in self.model_config.get("env_vars", {}).items(): + old_env[key] = os.environ.get(key) + os.environ[key] = value + + try: + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model_config["model_path"], + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.model_config["other_args"], + variant=self.model_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + timeout=5400, + ) + results = result_tuple[0] + success = result_tuple[1] + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + + self.assertTrue( + success, + f"Benchmark failed for {QWEN35_FP8_MODEL_PATH} on MI35x", + ) + finally: + for key, value in old_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + self.runner.write_final_report() + + +if __name__ == "__main__": + unittest.main() From 9bef99f743b4db9d003b4575ab6f9448d227c6bc Mon Sep 17 00:00:00 2001 From: Michael <13900043+michaelzhang-ai@users.noreply.github.com> Date: Mon, 6 Apr 2026 16:09:08 -0500 Subject: [PATCH 2/4] [AMD CI] Write Qwen3.5 accuracy results to GitHub step summary Override test_lm_eval in the Qwen3.5 accuracy tests to write a markdown results table to GITHUB_STEP_SUMMARY, matching the pattern used by the MXFP4 combined tests. No common code changed. 
--- .../accuracy/mi30x/test_qwen35_eval_amd.py | 41 +++++++++++++++++++ .../accuracy/mi35x/test_qwen35_eval_mi35x.py | 37 ++++++++++++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py index dae0e31c10f7..a82e6607a2d8 100644 --- a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py @@ -8,6 +8,10 @@ import os import unittest +from pathlib import Path + +import numpy as np +import yaml from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_amd_ci @@ -15,7 +19,9 @@ from sglang.test.test_utils import ( DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, + write_github_step_summary, ) register_amd_ci(est_time=3600, suite="nightly-amd-accuracy-8-gpu-qwen35", nightly=True) @@ -59,6 +65,41 @@ def setUpClass(cls): def tearDownClass(cls): kill_process_tree(cls.process.pid) + def test_lm_eval(self): + """Override to write accuracy results to GitHub step summary.""" + import requests + + requests.get(self.base_url + "/flush_cache") + + eval_config = yaml.safe_load( + Path(self.model_config_name).read_text(encoding="utf-8") + ) + results = self.launch_lm_eval(eval_config) + rtol = eval_config.get("rtol", self.default_rtol) + model_name = eval_config.get("model_name", self.model) + + success = True + summary = f"### lm-eval accuracy ({model_name})\n" + summary += "| task | metric | expected | measured | status |\n" + summary += "| ---- | ------ | -------- | -------- | ------ |\n" + for task in eval_config["tasks"]: + for metric in task["metrics"]: + expected = metric["value"] + measured = results["results"][task["name"]][metric["name"]] + passed = bool(np.isclose(expected, measured, rtol=rtol)) + status = "✅" if passed else "❌" + summary += f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | 
{status} |\n" + print( + f"{task['name']} | {metric['name']}: " + f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}" + ) + success = success and passed + + if is_in_ci(): + write_github_step_summary(summary) + + self.assertTrue(success, "lm-eval validation failed") + if __name__ == "__main__": unittest.main() diff --git a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py index 2c6b8059bfa8..658ba3744fe4 100644 --- a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py @@ -8,8 +8,11 @@ import os import unittest +from pathlib import Path +import numpy as np import requests +import yaml from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_amd_ci @@ -17,7 +20,9 @@ from sglang.test.test_utils import ( DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, + write_github_step_summary, ) register_amd_ci( @@ -40,7 +45,7 @@ def setUpClass(cls): cls.base_url = DEFAULT_URL_FOR_TEST def test_lm_eval(self): - """Override to handle server lifecycle within test method (MI35x pattern).""" + """Override to handle server lifecycle and write results to summary.""" other_args = [ "--tp", str(TP_SIZE), @@ -65,7 +70,35 @@ def test_lm_eval(self): try: requests.get(self.base_url + "/flush_cache") - super().test_lm_eval() + + eval_config = yaml.safe_load( + Path(self.model_config_name).read_text(encoding="utf-8") + ) + results = self.launch_lm_eval(eval_config) + rtol = eval_config.get("rtol", self.default_rtol) + model_name = eval_config.get("model_name", self.model) + + success = True + summary = f"### lm-eval accuracy ({model_name})\n" + summary += "| task | metric | expected | measured | status |\n" + summary += "| ---- | ------ | -------- | -------- | ------ |\n" + for task in eval_config["tasks"]: + for metric in task["metrics"]: + expected = metric["value"] + 
measured = results["results"][task["name"]][metric["name"]] + passed = bool(np.isclose(expected, measured, rtol=rtol)) + status = "✅" if passed else "❌" + summary += f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | {status} |\n" + print( + f"{task['name']} | {metric['name']}: " + f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}" + ) + success = success and passed + + if is_in_ci(): + write_github_step_summary(summary) + + self.assertTrue(success, "lm-eval validation failed") finally: kill_process_tree(process.pid) From 43f2e94058c0b5e1656617ec0a559f0ebfb2963d Mon Sep 17 00:00:00 2001 From: Michael <13900043+michaelzhang-ai@users.noreply.github.com> Date: Tue, 7 Apr 2026 00:34:45 -0500 Subject: [PATCH 3/4] [AMD CI] Switch Qwen3.5 FP8 perf tests to aiter attention backend --- test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py | 2 +- test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py index 824ed636eb02..be5314a6438a 100644 --- a/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py +++ b/test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py @@ -81,7 +81,7 @@ def setUpClass(cls): "--tp", "8", "--attention-backend", - "triton", + "aiter", "--mem-fraction-static", "0.8", "--model-loader-extra-config", diff --git a/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py index 97b3be9c34ba..6446eb601e84 100644 --- a/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py +++ b/test/registered/amd/perf/mi35x/test_qwen35_fp8_perf_mi35x.py @@ -78,7 +78,7 @@ def setUpClass(cls): "--tp", "8", "--attention-backend", - "triton", + "aiter", "--mem-fraction-static", "0.8", "--model-loader-extra-config", From c000b9d98513ebf09de2c9489ba19c15ce6f9a19 Mon Sep 
17 00:00:00 2001 From: Michael <13900043+michaelzhang-ai@users.noreply.github.com> Date: Tue, 7 Apr 2026 00:44:01 -0500 Subject: [PATCH 4/4] [AMD CI] Switch Qwen3.5 accuracy tests to aiter attention backend --- test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py | 2 +- test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py index a82e6607a2d8..112630ed474c 100644 --- a/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py @@ -44,7 +44,7 @@ def setUpClass(cls): "--tp", str(TP_SIZE), "--attention-backend", - "triton", + "aiter", "--trust-remote-code", "--model-loader-extra-config", '{"enable_multithread_load": true}', diff --git a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py index 658ba3744fe4..4b35a28d4405 100644 --- a/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py @@ -50,7 +50,7 @@ def test_lm_eval(self): "--tp", str(TP_SIZE), "--attention-backend", - "triton", + "aiter", "--trust-remote-code", "--model-loader-extra-config", '{"enable_multithread_load": true}',