Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
eb6ab38
Add MI35x nightly test files
michaelzhang-ai Jan 6, 2026
3e3e09e
Enhance nightly testing for AMD with MI35x support
michaelzhang-ai Jan 6, 2026
f679170
Add MI35x 8-GPU nightly tests and update run suite
michaelzhang-ai Jan 6, 2026
b82e8db
Refactor nightly tests for DeepSeek-R1 and MI35x models
michaelzhang-ai Jan 6, 2026
946bcb4
Enhance model path handling for DeepSeek-R1-MXFP4 tests
michaelzhang-ai Jan 6, 2026
12d3070
Update nightly test workflow and improve code formatting
michaelzhang-ai Jan 6, 2026
d33af4c
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 7, 2026
b106b20
Update nightly test workflow and model configurations for DeepSeek-R1…
michaelzhang-ai Jan 7, 2026
69da461
MI35x: Use DeepSeek-R1-0528 for basic+MTP only, skip DP/TC
michaelzhang-ai Jan 7, 2026
284def5
Remove pull_request trigger before merge
michaelzhang-ai Jan 7, 2026
9a675e8
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 7, 2026
132979d
Remove duplicate AMD nightly suites from test/srt/run_suite.py
michaelzhang-ai Jan 7, 2026
e62e96b
Merge upstream/main - resolve run_suite.py conflict
michaelzhang-ai Jan 7, 2026
30f127f
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 7, 2026
6ba4cba
Merge branch 'main' into add-mi35x-nightly-tests
bingxche Jan 8, 2026
21e61fb
Merge branch 'main' into add-mi35x-nightly-tests
HaiShaw Jan 8, 2026
3af0b76
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 8, 2026
5ce5747
Merge branch 'main' into add-mi35x-nightly-tests
HaiShaw Jan 8, 2026
3006d68
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 8, 2026
6786ba7
Merge branch 'main' into add-mi35x-nightly-tests
michaelzhang-ai Jan 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 221 additions & 51 deletions .github/workflows/nightly-test-amd.yml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
AMD GSM8K Completion Evaluation Test
AMD GSM8K Completion Evaluation Test (Migrated from test/srt/nightly/)

This test uses the completion-based gsm8k benchmark (few-shot prompting)
which works with base models that don't have chat templates.
Expand All @@ -20,6 +20,8 @@
- "deepseek-v3-mtp": DeepSeek-V3 with MTP/EAGLE (nightly-amd-8-gpu-deepseek-v3-mtp)
- "deepseek-r1": DeepSeek-R1 reasoning model (nightly-amd-8-gpu-deepseek-r1)
- "all": All models

Registry: nightly-amd-8-gpu suite (8-GPU tests)
"""

import ast
Expand All @@ -44,6 +46,7 @@
print("[WARNING] huggingface_hub not available - model cache checking disabled")

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
Expand All @@ -53,6 +56,9 @@
)
from sglang.utils import download_and_cache_file, read_jsonl

# Register for AMD CI - GSM8K completion tests (~120 min)
register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu", nightly=True)

INVALID = -9999999


Expand All @@ -67,13 +73,22 @@ class BaseModelConfig:
env_vars: Optional[dict] = None
tokenizer_path: Optional[str] = None
timeout: Optional[int] = None # Custom timeout for server launch (seconds)
variant: Optional[str] = (
None # Test variant name (e.g., "basic", "MTP", "DP", "TC")
)

def __post_init__(self):
if self.other_args is None:
self.other_args = []
if self.env_vars is None:
self.env_vars = {}

def get_display_name(self) -> str:
    """Build the label used for this config in logs and the summary.

    Returns:
        ``"<model_path> (<variant>)"`` when ``variant`` is truthy,
        otherwise ``model_path`` on its own.
    """
    label = self.model_path
    if not self.variant:
        return label
    return f"{label} ({self.variant})"


# =============================================================================
# MODEL GROUPS - Each group runs on a separate 8-GPU runner
Expand Down Expand Up @@ -193,113 +208,107 @@ def __post_init__(self):
),
]

# Group 3: DeepSeek-V3 with DP Attention
# Runner: nightly-amd-8-gpu-deepseek-v3-dp
# Note: Uses DP attention (dp-size=8) for better performance, requires ROCm 7.0+
AMD_DEEPSEEK_V3_DP_MODELS = [
# DeepSeek-V3-0324 with DP attention
# Note: DeepSeek-V3 accuracy tests removed - V3 only used for perf tests
# See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py for V3 perf tests

# Group 3: DeepSeek-R1 (reasoning model) - Basic + MTP combined
# Runner: nightly-amd-8-gpu-deepseek-r1
AMD_DEEPSEEK_R1_MODELS = [
# DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU
BaseModelConfig(
model_path="deepseek-ai/DeepSeek-V3-0324",
model_path="deepseek-ai/DeepSeek-R1-0528",
tp_size=8,
accuracy_threshold=0.93,
timeout=3600, # 1 hour for large model
variant="basic",
other_args=[
"--attention-backend",
"aiter",
"--chunked-prefill-size",
"131072",
"--dp-size",
"8",
"--enable-dp-attention",
"--disable-radix-cache",
"--mem-fraction-static",
"0.85",
"--trust-remote-code",
],
env_vars={
"SGLANG_USE_ROCM700A": "1",
"SGLANG_USE_AITER": "1",
},
),
]

# Group 3b: DeepSeek-V3 with Torch Compile
# Runner: nightly-amd-8-gpu-deepseek-v3-tc
# Note: Uses torch compile for performance optimization, requires ROCm 7.0+
AMD_DEEPSEEK_V3_TC_MODELS = [
# DeepSeek-V3-0324 with torch compile
# DeepSeek-R1-0528 with MTP (EAGLE speculative decoding)
BaseModelConfig(
model_path="deepseek-ai/DeepSeek-V3-0324",
model_path="deepseek-ai/DeepSeek-R1-0528",
tp_size=8,
accuracy_threshold=0.93,
timeout=7200, # 2 hours for compilation + large model
timeout=3600,
variant="MTP",
other_args=[
"--chunked-prefill-size",
"131072",
"--speculative-algorithm",
"EAGLE",
"--speculative-num-steps",
"3",
"--speculative-eagle-topk",
"1",
"--speculative-num-draft-tokens",
"4",
"--mem-fraction-static",
"0.70", # Reduced further for torch compile
"--cuda-graph-max-bs",
"8", # Reduced from 16 to reduce memory
"--enable-torch-compile",
"--disable-cuda-graph", # Disable cuda graph to avoid memory issues
"0.7",
"--trust-remote-code",
],
env_vars={
"SGLANG_USE_ROCM700A": "1",
"SGLANG_USE_AITER": "1",
},
),
]

# Group 3c: DeepSeek-V3 with MTP (EAGLE speculative decoding)
# Runner: nightly-amd-8-gpu-deepseek-v3-mtp
# Note: Uses MTP for improved throughput, requires ROCm 7.0+
AMD_DEEPSEEK_V3_MTP_MODELS = [
# DeepSeek-V3-0324 with MTP (EAGLE speculative decoding)
# Group 4: DeepSeek-R1 with DP + TC combined
# Runner: nightly-amd-8-gpu-deepseek-r1-dp-tc
# Combines DP attention and Torch Compile tests for DeepSeek-R1
AMD_DEEPSEEK_R1_DP_TC_MODELS = [
# DeepSeek-R1-0528 with DP attention
BaseModelConfig(
model_path="deepseek-ai/DeepSeek-V3-0324",
model_path="deepseek-ai/DeepSeek-R1-0528",
tp_size=8,
accuracy_threshold=0.93,
timeout=3600, # 1 hour for large model
timeout=3600,
variant="DP",
other_args=[
"--chunked-prefill-size",
"131072",
"--speculative-algorithm",
"EAGLE",
"--speculative-num-steps",
"3",
"--speculative-eagle-topk",
"1",
"--speculative-num-draft-tokens",
"4",
"--dp-size",
"8",
"--enable-dp-attention",
"--mem-fraction-static",
"0.7",
"0.85",
"--trust-remote-code",
],
env_vars={
"SGLANG_USE_ROCM700A": "1",
"SGLANG_USE_AITER": "1",
},
),
]

# Group 4: DeepSeek-R1 (reasoning model)
# Runner: nightly-amd-8-gpu-deepseek-r1
AMD_DEEPSEEK_R1_MODELS = [
# DeepSeek-R1-0528 - reasoning model, ~80GB per GPU
# DeepSeek-R1-0528 with torch compile
BaseModelConfig(
model_path="deepseek-ai/DeepSeek-R1-0528",
tp_size=8,
accuracy_threshold=0.93,
timeout=3600, # 1 hour for large model
timeout=7200, # 2 hours for compilation
variant="TC",
other_args=[
"--attention-backend",
"aiter",
"--chunked-prefill-size",
"131072",
"--disable-radix-cache",
"--mem-fraction-static",
"0.85",
"0.70",
"--cuda-graph-max-bs",
"8",
"--enable-torch-compile",
"--disable-cuda-graph",
"--trust-remote-code",
],
env_vars={
"SGLANG_USE_ROCM700A": "1",
"SGLANG_USE_AITER": "1",
},
),
Expand All @@ -312,27 +321,28 @@ def get_model_group() -> str:


def get_models_for_group(group: str) -> List[BaseModelConfig]:
"""Get the list of models for a given group."""
"""Get the list of models for a given group.

Note: DeepSeek-V3 is only used for perf tests, not accuracy tests.
See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py.
"""
if group == "gpt-oss":
return AMD_GPT_OSS_MODELS
elif group == "grok":
return AMD_GROK_MODELS
elif group == "deepseek-v3-dp":
return AMD_DEEPSEEK_V3_DP_MODELS
elif group == "deepseek-v3-tc":
return AMD_DEEPSEEK_V3_TC_MODELS
elif group == "deepseek-v3-mtp":
return AMD_DEEPSEEK_V3_MTP_MODELS
elif group == "deepseek-r1":
return AMD_DEEPSEEK_R1_MODELS
elif group == "deepseek-r1-dp-tc":
return AMD_DEEPSEEK_R1_DP_TC_MODELS
elif group == "deepseek-r1-all":
# All DeepSeek-R1 variants: basic, MTP, DP, TC
return AMD_DEEPSEEK_R1_MODELS + AMD_DEEPSEEK_R1_DP_TC_MODELS
elif group == "all":
return (
AMD_GPT_OSS_MODELS
+ AMD_GROK_MODELS
+ AMD_DEEPSEEK_V3_DP_MODELS
+ AMD_DEEPSEEK_V3_TC_MODELS
+ AMD_DEEPSEEK_V3_MTP_MODELS
+ AMD_DEEPSEEK_R1_MODELS
+ AMD_DEEPSEEK_R1_DP_TC_MODELS
)
else:
print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'")
Expand Down Expand Up @@ -671,9 +681,10 @@ def test_gsm8k_completion_all_models(self):
)

for config in self.models:
with self.subTest(model=config.model_path):
display_name = config.get_display_name()
with self.subTest(model=display_name):
print(f"\n{'='*60}")
print(f"Testing: {config.model_path} (TP={config.tp_size})")
print(f"Testing: {display_name} (TP={config.tp_size})")
print(f"{'='*60}")

error_message = None
Expand All @@ -687,12 +698,12 @@ def test_gsm8k_completion_all_models(self):

if not is_available:
print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}")
print(f"⏭️ SKIPPING: {config.model_path}")
print(f"⏭️ SKIPPING: {display_name}")
status = f"⏭️ SKIP"
skipped = True
all_results.append(
{
"model": config.model_path,
"model": display_name,
"tp_size": config.tp_size,
"accuracy": None,
"threshold": config.accuracy_threshold,
Expand All @@ -709,7 +720,7 @@ def test_gsm8k_completion_all_models(self):
else:
try:
# Launch server with timing
print(f"\n🚀 Launching server for {config.model_path}...")
print(f"\n🚀 Launching server for {display_name}...")
server_start = time.time()
process = popen_launch_server_for_base_model(
self.base_url, config
Expand Down Expand Up @@ -747,7 +758,7 @@ def test_gsm8k_completion_all_models(self):

total_time = time.time() - model_start

print(f"\n📈 Results for {config.model_path}:")
print(f"\n📈 Results for {display_name}:")
print(
f" Accuracy: {acc:.3f} (threshold: {config.accuracy_threshold})"
)
Expand All @@ -768,7 +779,7 @@ def test_gsm8k_completion_all_models(self):

all_results.append(
{
"model": config.model_path,
"model": display_name,
"tp_size": config.tp_size,
"accuracy": acc,
"threshold": config.accuracy_threshold,
Expand All @@ -790,7 +801,7 @@ def test_gsm8k_completion_all_models(self):
status = "❌ ERROR"
all_results.append(
{
"model": config.model_path,
"model": display_name,
"tp_size": config.tp_size,
"accuracy": None,
"threshold": config.accuracy_threshold,
Expand All @@ -806,7 +817,7 @@ def test_gsm8k_completion_all_models(self):
)

finally:
print(f"\n🛑 Stopping server for {config.model_path}...")
print(f"\n🛑 Stopping server for {display_name}...")
kill_process_tree(process.pid)

except Exception as e:
Expand All @@ -816,7 +827,7 @@ def test_gsm8k_completion_all_models(self):
status = "❌ ERROR"
all_results.append(
{
"model": config.model_path,
"model": display_name,
"tp_size": config.tp_size,
"accuracy": None,
"threshold": config.accuracy_threshold,
Expand All @@ -831,14 +842,14 @@ def test_gsm8k_completion_all_models(self):
}
)

# Add to summary with runtime
# Add to summary with runtime (use display name to show variant)
acc_str = f"{acc:.3f}" if acc is not None else "N/A"
startup_str = (
f"{startup_time:.0f}s" if startup_time is not None else "N/A"
)
bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A"
total_str = f"{total_time:.0f}s" if total_time is not None else "N/A"
summary += f"| {config.model_path} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n"
summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n"

# Calculate total test runtime
total_test_time = time.time() - total_test_start
Expand Down
Loading
Loading