Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions .github/workflows/nightly-test-amd-rocm720.yml
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2
# 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2
nightly-8-gpu-qwen35-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35-rocm720,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -653,6 +653,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test ROCm 7.2 (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU GLM-5 (Accuracy) ROCm 7.2
nightly-8-gpu-glm5-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,'))
Expand Down Expand Up @@ -1219,7 +1231,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2
# MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2
nightly-8-gpu-mi35x-qwen35-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35-rocm720,'))
runs-on: linux-mi35x-gpu-8
Expand Down Expand Up @@ -1252,6 +1264,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test MI35x ROCm 7.2 (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-mi35x-glm5-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,'))
runs-on: linux-mi35x-gpu-8
Expand Down
28 changes: 26 additions & 2 deletions .github/workflows/nightly-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU Qwen 3.5 (Accuracy)
# 8-GPU Qwen 3.5 (Accuracy + Performance combined)
nightly-8-gpu-qwen35:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -656,6 +656,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-glm5:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -1224,7 +1236,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Qwen 3.5 (Accuracy)
# MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined)
nightly-8-gpu-mi35x-qwen35:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35,'))
runs-on: linux-mi35x-gpu-8
Expand Down Expand Up @@ -1257,6 +1269,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test MI35x (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-mi35x-glm5:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,'))
runs-on: linux-mi35x-gpu-8
Expand Down
43 changes: 42 additions & 1 deletion test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,20 @@

import os
import unittest
from pathlib import Path

import numpy as np
import yaml

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.kits.lm_eval_kit import LMEvalMixin
from sglang.test.test_utils import (
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)

register_amd_ci(est_time=3600, suite="nightly-amd-accuracy-8-gpu-qwen35", nightly=True)
Expand All @@ -38,7 +44,7 @@ def setUpClass(cls):
"--tp",
str(TP_SIZE),
"--attention-backend",
"triton",
"aiter",
"--trust-remote-code",
"--model-loader-extra-config",
'{"enable_multithread_load": true}',
Expand All @@ -59,6 +65,41 @@ def setUpClass(cls):
def tearDownClass(cls):
kill_process_tree(cls.process.pid)

def test_lm_eval(self):
    """Override to write accuracy results to GitHub step summary.

    Flushes the server cache, runs lm-eval with the YAML config named by
    ``self.model_config_name``, and compares every configured metric with
    its expected value using ``np.isclose`` and the config's ``rtol``
    (falling back to ``self.default_rtol``). A markdown table of the
    comparisons is written to the step summary when running in CI, and
    the test fails if any metric is outside tolerance.
    """
    import requests

    requests.get(f"{self.base_url}/flush_cache")

    config_text = Path(self.model_config_name).read_text(encoding="utf-8")
    eval_config = yaml.safe_load(config_text)
    results = self.launch_lm_eval(eval_config)
    rtol = eval_config.get("rtol", self.default_rtol)
    model_name = eval_config.get("model_name", self.model)

    # Collect the markdown pieces and join once at the end.
    table = [
        f"### lm-eval accuracy ({model_name})\n",
        "| task | metric | expected | measured | status |\n",
        "| ---- | ------ | -------- | -------- | ------ |\n",
    ]
    all_passed = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            passed = bool(np.isclose(expected, measured, rtol=rtol))
            if passed:
                status = "✅"
            else:
                status = "❌"
                all_passed = False
            table.append(
                f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | {status} |\n"
            )
            print(
                f"{task['name']} | {metric['name']}: "
                f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}"
            )

    summary = "".join(table)
    if is_in_ci():
        write_github_step_summary(summary)

    # Assert only after the summary is published so failures are still reported.
    self.assertTrue(all_passed, "lm-eval validation failed")


if __name__ == "__main__":
unittest.main()
39 changes: 36 additions & 3 deletions test/registered/amd/accuracy/mi35x/test_qwen35_eval_mi35x.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,21 @@

import os
import unittest
from pathlib import Path

import numpy as np
import requests
import yaml

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.kits.lm_eval_kit import LMEvalMixin
from sglang.test.test_utils import (
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)

register_amd_ci(
Expand All @@ -40,12 +45,12 @@ def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST

def test_lm_eval(self):
"""Override to handle server lifecycle within test method (MI35x pattern)."""
"""Override to handle server lifecycle and write results to summary."""
other_args = [
"--tp",
str(TP_SIZE),
"--attention-backend",
"triton",
"aiter",
"--trust-remote-code",
"--model-loader-extra-config",
'{"enable_multithread_load": true}',
Expand All @@ -65,7 +70,35 @@ def test_lm_eval(self):

try:
requests.get(self.base_url + "/flush_cache")
super().test_lm_eval()

eval_config = yaml.safe_load(
Path(self.model_config_name).read_text(encoding="utf-8")
)
results = self.launch_lm_eval(eval_config)
rtol = eval_config.get("rtol", self.default_rtol)
model_name = eval_config.get("model_name", self.model)

success = True
summary = f"### lm-eval accuracy ({model_name})\n"
summary += "| task | metric | expected | measured | status |\n"
summary += "| ---- | ------ | -------- | -------- | ------ |\n"
for task in eval_config["tasks"]:
for metric in task["metrics"]:
expected = metric["value"]
measured = results["results"][task["name"]][metric["name"]]
passed = bool(np.isclose(expected, measured, rtol=rtol))
status = "✅" if passed else "❌"
summary += f"| {task['name']} | {metric['name']} | {expected:.4f} | {measured:.4f} | {status} |\n"
print(
f"{task['name']} | {metric['name']}: "
f"expected={expected:.3f} | measured={measured:.3f} | rtol={rtol}"
)
success = success and passed

if is_in_ci():
write_github_step_summary(summary)

self.assertTrue(success, "lm-eval validation failed")
finally:
kill_process_tree(process.pid)

Expand Down
139 changes: 139 additions & 0 deletions test/registered/amd/perf/mi30x/test_qwen35_fp8_perf_amd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""Nightly performance benchmark for Qwen3.5-397B-A17B FP8.

Tests Qwen3.5-397B-A17B-FP8 (MoE, Hybrid Attention with Gated Delta Networks)
on 8 GPUs with aiter attention backend.

Model path can be configured via environment variable:
- QWEN35_FP8_MODEL_PATH: Path to Qwen3.5-FP8 model
(default: Qwen/Qwen3.5-397B-A17B-FP8)

Example usage:
python -m pytest test_qwen35_fp8_perf_amd.py -v
"""

import os
import unittest
from typing import List

from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.nightly_bench_utils import BenchmarkResult
from sglang.test.nightly_utils import NightlyBenchmarkRunner
from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env

register_amd_ci(est_time=5400, suite="nightly-perf-8-gpu-qwen35-fp8", nightly=True)


def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str:
    """Generate a simplified markdown report without traces and cost columns.

    The heading is built from the first result's ``model_path``, its
    ``run_name`` (when set and not ``"default"``) and the ``GPU_CONFIG``
    environment variable (default ``"MI325"``). One table row is emitted
    per reported result.

    Skips the first result if it's a warmup run (duplicate batch_size),
    i.e. when the first two results share the same ``batch_size``.

    Args:
        results: Benchmark results to render, in run order.

    Returns:
        The markdown heading and table, or an empty string when
        ``results`` is empty.
    """
    if not results:
        # Nothing to report; also avoids an IndexError on results[0].
        return ""

    first = results[0]
    model_header = first.model_path
    if first.run_name and first.run_name != "default":
        model_header += f" ({first.run_name})"

    gpu_config = os.getenv("GPU_CONFIG", "MI325")
    if gpu_config:
        model_header += f" [{gpu_config}]"

    lines = [
        f"### {model_header}\n",
        "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n",
        "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n",
    ]

    report_results = (
        results[1:]
        if len(results) > 1 and results[0].batch_size == results[1].batch_size
        else results
    )

    for result in report_results:
        # ITL in ms is derived as batch_size / output_throughput * 1000.
        # A run that produced no output reports throughput 0; show 0
        # instead of crashing report generation with ZeroDivisionError.
        itl = (
            result.batch_size / result.output_throughput * 1000
            if result.output_throughput > 0
            else 0.0
        )
        lines.append(
            f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n"
        )

    return "".join(lines)


QWEN35_FP8_MODEL_PATH = os.environ.get(
"QWEN35_FP8_MODEL_PATH", "Qwen/Qwen3.5-397B-A17B-FP8"
)
PROFILE_DIR = "performance_profiles_qwen35_fp8"


class TestNightlyQwen35Fp8Performance(unittest.TestCase):
    """Nightly performance benchmark for Qwen3.5-397B-A17B FP8.

    Hands the benchmark runner server arguments selecting the aiter
    attention backend with ``--tp 8``.
    Runtime: ~90 minutes
    """

    @classmethod
    def setUpClass(cls):
        """Set benchmark sizes, server arguments and the report runner."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.batch_sizes = [1, 8, 16, 64]
        # Input/output lengths can be overridden through the environment.
        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))

        server_args = [
            "--trust-remote-code",
            "--tp",
            "8",
            "--attention-backend",
            "aiter",
            "--mem-fraction-static",
            "0.8",
            "--model-loader-extra-config",
            '{"enable_multithread_load": true}',
            "--watchdog-timeout",
            "1200",
        ]
        cls.model_config = {
            "name": "qwen35-fp8",
            "model_path": QWEN35_FP8_MODEL_PATH,
            "other_args": server_args,
            "env_vars": {"SGLANG_USE_AITER": "1"},
        }

        runner = cls.runner = NightlyBenchmarkRunner(
            PROFILE_DIR, cls.__name__, cls.base_url
        )
        runner.setup_profile_directory()
        runner.full_report = f"## {cls.__name__}\n"

    def test_bench_qwen35_fp8(self):
        """Run benchmark for Qwen3.5-397B-A17B FP8.

        Applies the configured environment variables for the duration of
        the run, appends the markdown table to the runner's report when
        results were produced, and always restores the environment and
        writes the final report afterwards.
        """
        config = self.model_config
        overrides = config.get("env_vars", {})
        # Remember the prior values (None == unset) so they can be restored.
        previous = {name: os.environ.get(name) for name in overrides}
        os.environ.update(overrides)

        try:
            outcome = self.runner.run_benchmark_for_model(
                model_path=config["model_path"],
                batch_sizes=self.batch_sizes,
                input_lens=self.input_lens,
                output_lens=self.output_lens,
                other_args=config["other_args"],
                variant=config["name"],
                extra_bench_args=["--trust-remote-code"],
                enable_profile=False,
                timeout=5400,
            )
            # Index rather than unpack: only the first two items are used.
            results, success = outcome[0], outcome[1]

            if results:
                self.runner.full_report += (
                    generate_simple_markdown_report(results) + "\n"
                )

            self.assertTrue(success, f"Benchmark failed for {QWEN35_FP8_MODEL_PATH}")
        finally:
            for name, prior in previous.items():
                if prior is not None:
                    os.environ[name] = prior
                else:
                    os.environ.pop(name, None)
            self.runner.write_final_report()
Comment thread
michaelzhang-ai marked this conversation as resolved.


if __name__ == "__main__":
unittest.main()
Loading
Loading