Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 40 additions & 5 deletions .github/workflows/nightly-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ on:
- 'nightly-8-gpu-kimi-k2'
# MI35x jobs
- 'nightly-test-1-gpu-mi35x'
- 'nightly-8-gpu-mi35x-kimi-k2'
- 'nightly-accuracy-8-gpu-mi35x'
- 'nightly-8-gpu-mi35x-grok1-int4'
- 'nightly-8-gpu-mi35x-grok2'
Expand Down Expand Up @@ -582,13 +583,13 @@ jobs:
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

- name: Accuracy Test MI35x (8-GPU Grok1-INT4)
timeout-minutes: 60
timeout-minutes: 90
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e RCCL_MSCCL_ENABLE=0 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

Expand Down Expand Up @@ -793,6 +794,39 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Kimi-K2 (Accuracy)
nightly-8-gpu-mi35x-kimi-k2:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k2')
runs-on: linux-mi35x-gpu-8
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}

- name: Setup docker
run: |
touch github_summary.md
bash scripts/ci/amd/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh
# Install tabulate for run_suite.py (missing in MI35x container)
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

- name: Accuracy Test MI35x (8-GPU Kimi-K2)
timeout-minutes: 180
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP)
nightly-perf-8-gpu-mi35x-deepseek-v32-mtp:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp')
Expand Down Expand Up @@ -834,10 +868,10 @@ jobs:
# MI30x Accuracy Tests
- nightly-accuracy-2-gpu
- nightly-accuracy-2-gpu-vlm
# MI30x Performance Tests
- nightly-perf-2-gpu-text
- nightly-perf-2-gpu-vlm
- nightly-accuracy-8-gpu
# MI30x Performance Tests - excluded from check (perf failures don't block CI)
# - nightly-perf-2-gpu-text
# - nightly-perf-2-gpu-vlm
# MI30x Combined Accuracy + Performance Tests
- nightly-8-gpu-grok1-int4
- nightly-8-gpu-grok2
Expand All @@ -853,6 +887,7 @@ jobs:
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4
- nightly-accuracy-8-gpu-mi35x-deepseek-v32
- nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp
- nightly-8-gpu-mi35x-kimi-k2
# MI35x perf jobs excluded from check - perf failures don't block CI
# - nightly-perf-8-gpu-mi35x-deepseek-v32-basic
# - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp
Expand Down
20 changes: 16 additions & 4 deletions python/sglang/test/nightly_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def build_benchmark_command(
json_output_file: str,
extra_args: Optional[List[str]] = None,
server_args: Optional[List[str]] = None,
enable_profile: bool = True,
) -> List[str]:
"""Build the benchmark command with all required arguments.

Expand All @@ -106,6 +107,7 @@ def build_benchmark_command(
json_output_file: Path to JSON output file
extra_args: Optional extra arguments to append to command
server_args: Optional server launch arguments to record in metrics
enable_profile: Whether to enable profiling (default True for NVIDIA)

Returns:
List of command arguments ready for subprocess.run()
Expand All @@ -125,15 +127,22 @@ def build_benchmark_command(
"--output-len",
*[str(x) for x in output_lens],
"--show-report",
"--profile",
"--profile-by-stage",
"--profile-output-dir",
profile_path_prefix,
f"--pydantic-result-filename={json_output_file}",
"--no-append-to-github-summary",
"--trust-remote-code",
]

# Add profiling flags only if enabled (disabled for AMD tests)
if enable_profile and profile_path_prefix:
command.extend(
[
"--profile",
"--profile-by-stage",
"--profile-output-dir",
profile_path_prefix,
]
)

if extra_args:
command.extend(extra_args)

Expand Down Expand Up @@ -218,6 +227,7 @@ def run_benchmark_for_model(
other_args: Optional[List[str]] = None,
variant: str = "",
extra_bench_args: Optional[List[str]] = None,
enable_profile: bool = True,
) -> Tuple[List[BenchmarkResult], bool, Optional[float]]:
"""Run a complete benchmark for a single model with server management.

Expand All @@ -236,6 +246,7 @@ def run_benchmark_for_model(
other_args: Arguments to pass to server launch
variant: Optional variant suffix (e.g., "basic", "mtp")
extra_bench_args: Extra arguments for the benchmark command
enable_profile: Whether to enable profiling (default True for NVIDIA)

Returns:
Tuple of (list of BenchmarkResult objects, success_bool, avg_spec_accept_length or None)
Expand Down Expand Up @@ -273,6 +284,7 @@ def run_benchmark_for_model(
json_output_file,
extra_args=bench_args,
server_args=other_args,
enable_profile=enable_profile,
)

result, cmd_success = self.run_benchmark_command(command, model_description)
Expand Down
8 changes: 6 additions & 2 deletions test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ def __post_init__(self):
"triton",
"--trust-remote-code",
],
env_vars={"SGLANG_USE_AITER": "1"},
env_vars={
"SGLANG_USE_AITER": "0"
}, # Disabled due to SWA eviction bug with aiter (#17220)
),
ModelConfig(
model_path="openai/gpt-oss-120b",
Expand All @@ -93,7 +95,9 @@ def __post_init__(self):
"triton",
"--trust-remote-code",
],
env_vars={"SGLANG_USE_AITER": "1"},
env_vars={
"SGLANG_USE_AITER": "0"
}, # Disabled due to SWA eviction bug with aiter (#17220)
),
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
)
from sglang.utils import download_and_cache_file, read_jsonl

# Register for AMD CI - GROK1-INT4 accuracy tests on MI35x (~25 min)
# Register for AMD CI - GROK1-INT4 accuracy tests on MI35x (~70 min)
register_amd_ci(
est_time=1500, suite="nightly-amd-accuracy-8-gpu-mi35x-grok1-int4", nightly=True
est_time=4200, suite="nightly-amd-accuracy-8-gpu-mi35x-grok1-int4", nightly=True
)

INVALID = -9999999
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class TestGrok2EvalMI35x(unittest.TestCase):
def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST
cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200"))
cls.accuracy_threshold = 0.915
cls.accuracy_threshold = 0.90

def test_grok2_accuracy(self):
"""Test Grok-2 with GSM8K completion benchmark."""
Expand Down
105 changes: 105 additions & 0 deletions test/registered/amd/accuracy/mi35x/test_kimi_k2_eval_mi35x.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""MI35x Kimi-K2 GSM8K Completion Evaluation Test (8-GPU)

Tests moonshotai/Kimi-K2-Instruct-0905 with GSM8K few-shot benchmark on MI35x.

Registry: nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 suite
"""

import os
import unittest
from types import SimpleNamespace

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)

# Register for AMD CI - Kimi K2 accuracy test on MI35x (~60 min)
# est_time is a scheduling estimate in seconds; the suite name must match the
# --suite value used by run_suite.py in the nightly workflow.
register_amd_ci(
    est_time=3600, suite="nightly-amd-accuracy-8-gpu-mi35x-kimi-k2", nightly=True
)

# Model under test and its launch/accuracy parameters.
KIMI_K2_MODEL_PATH = "moonshotai/Kimi-K2-Instruct-0905"
SERVER_LAUNCH_TIMEOUT = 3600  # seconds to wait for server startup (large model load)
ACCURACY_THRESHOLD = 0.94  # minimum acceptable GSM8K accuracy


class TestKimiK2EvalMI35x(CustomTestCase):
    """Kimi-K2 GSM8K Completion Evaluation Test for AMD MI35x.

    Launches an sglang server for Kimi-K2 with TP=8 and AMD-specific
    attention backends, runs the few-shot GSM8K benchmark against it, and
    asserts that accuracy meets ACCURACY_THRESHOLD. In CI, a markdown
    results table is appended to the GitHub step summary.
    """

    @classmethod
    def setUpClass(cls):
        # Shared endpoint for all tests in this class.
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_kimi_k2_gsm8k_accuracy(self):
        """Test Kimi-K2 with GSM8K few-shot completion benchmark."""
        other_args = [
            "--tp",
            "8",
            # Mixed backends: triton for decode, aiter for prefill.
            "--decode-attention-backend",
            "triton",
            "--prefill-attention-backend",
            "aiter",
            "--trust-remote-code",
            "--model-loader-extra-config",
            '{"enable_multithread_load": true}',
            # Generous watchdog: large-model load/warmup can stall briefly.
            "--watchdog-timeout",
            "1200",
        ]
        env = os.environ.copy()
        env["SGLANG_USE_AITER"] = "1"
        env["SGLANG_ROCM_FUSED_DECODE_MLA"] = "0"

        process = popen_launch_server(
            KIMI_K2_MODEL_PATH,
            self.base_url,
            timeout=SERVER_LAUNCH_TIMEOUT,
            other_args=other_args,
            env=env,
        )

        try:
            # Flush the server cache so the eval starts from a clean state.
            # Fix: a timeout is required here — requests.get with no timeout
            # blocks forever if the server wedges, hanging the whole CI job
            # instead of failing it.
            requests.get(self.base_url + "/flush_cache", timeout=120)

            args = SimpleNamespace(
                num_shots=8,
                data_path=None,  # None -> use the default GSM8K dataset
                num_questions=1319,
                parallel=1319,  # submit all questions concurrently
                max_new_tokens=512,
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
            metrics = run_eval_few_shot_gsm8k(args)
            acc = metrics["accuracy"]

            passed = acc >= ACCURACY_THRESHOLD
            status = "✅ PASS" if passed else "❌ FAIL"
            print(f" accuracy={acc:.3f} threshold={ACCURACY_THRESHOLD} {status}")

            if is_in_ci():
                # Publish a markdown results table to the GitHub step summary.
                summary = "### Kimi-K2 Model (MI35x)\n\n"
                summary += "| Model | TP | Accuracy | Threshold | Status |\n"
                summary += "| ----- | -- | -------- | --------- | ------ |\n"
                summary += f"| {KIMI_K2_MODEL_PATH} | 8 | {acc:.3f} | {ACCURACY_THRESHOLD} | {status} |\n"
                write_github_step_summary(summary)

            self.assertGreaterEqual(
                acc,
                ACCURACY_THRESHOLD,
                f"Kimi-K2 accuracy {acc:.3f} below threshold {ACCURACY_THRESHOLD}",
            )
        finally:
            # Always tear down the server process tree, even on failure.
            kill_process_tree(process.pid)


# Allow running this test file directly (outside the CI suite runner).
if __name__ == "__main__":
    unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def test_bench_grok1_fp8(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_bench_grok1_int4(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def test_bench_grok2(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def test_bench_one_batch(self):
input_lens=self.input_lens,
output_lens=self.output_lens,
other_args=other_args,
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def test_bench_one_batch(self):
output_lens=self.output_lens,
other_args=other_args,
extra_bench_args=extra_bench_args,
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def _run_benchmark_with_timeout(
profile_path_prefix,
json_output_file,
extra_args=bench_args,
enable_profile=False, # Disable profiling for AMD tests
)
_, cmd_success = runner.run_benchmark_command(command, model_description)
if not cmd_success:
Expand Down
Loading
Loading