sgl-project · HaiShaw · Jan 9, 2026 · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026
diff --git a/...nightly/test_gsm8k_completion_eval_amd.py → ...nightly/test_gsm8k_completion_eval_amd.py b/...nightly/test_gsm8k_completion_eval_amd.py → ...nightly/test_gsm8k_completion_eval_amd.py
@@ -1,5 +1,5 @@
 """
-AMD GSM8K Completion Evaluation Test
+AMD GSM8K Completion Evaluation Test (Migrated from test/srt/nightly/)
 
 This test uses the completion-based gsm8k benchmark (few-shot prompting)
 which works with base models that don't have chat templates.
@@ -20,6 +20,8 @@
 - "deepseek-v3-mtp": DeepSeek-V3 with MTP/EAGLE (nightly-amd-8-gpu-deepseek-v3-mtp)
 - "deepseek-r1": DeepSeek-R1 reasoning model (nightly-amd-8-gpu-deepseek-r1)
 - "all": All models
+
+Registry: nightly-amd-8-gpu suite (8-GPU tests)
 """
 
 import ast
@@ -44,6 +46,7 @@
     print("[WARNING] huggingface_hub not available - model cache checking disabled")
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_amd_ci
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
@@ -53,6 +56,9 @@
 )
 from sglang.utils import download_and_cache_file, read_jsonl
 
+# Register for AMD CI - GSM8K completion tests (~120 min)
+register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu", nightly=True)
+
 INVALID = -9999999
 
 
@@ -67,13 +73,22 @@ class BaseModelConfig:
     env_vars: Optional[dict] = None
     tokenizer_path: Optional[str] = None
     timeout: Optional[int] = None  # Custom timeout for server launch (seconds)
+    variant: Optional[str] = (
+        None  # Test variant name (e.g., "basic", "MTP", "DP", "TC")
+    )
 
     def __post_init__(self):
         if self.other_args is None:
             self.other_args = []
         if self.env_vars is None:
             self.env_vars = {}
 
+    def get_display_name(self) -> str:
+        """Return display name for logs/summary (model + variant if set)."""
+        if self.variant:
+            return f"{self.model_path} ({self.variant})"
+        return self.model_path
+
 
 # =============================================================================
 # MODEL GROUPS - Each group runs on a separate 8-GPU runner
@@ -193,113 +208,107 @@ def __post_init__(self):
     ),
 ]
 
-# Group 3: DeepSeek-V3 with DP Attention
-# Runner: nightly-amd-8-gpu-deepseek-v3-dp
-# Note: Uses DP attention (dp-size=8) for better performance, requires ROCm 7.0+
-AMD_DEEPSEEK_V3_DP_MODELS = [
-    # DeepSeek-V3-0324 with DP attention
+# Note: DeepSeek-V3 accuracy tests were removed; V3 is now only used for perf tests.
+# See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py for V3 perf tests
+
+# Group 3: DeepSeek-R1 (reasoning model) - Basic + MTP combined
+# Runner: nightly-amd-8-gpu-deepseek-r1
+AMD_DEEPSEEK_R1_MODELS = [
+    # DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU
     BaseModelConfig(
-        model_path="deepseek-ai/DeepSeek-V3-0324",
+        model_path="deepseek-ai/DeepSeek-R1-0528",
         tp_size=8,
         accuracy_threshold=0.93,
         timeout=3600,  # 1 hour for large model
+        variant="basic",
         other_args=[
+            "--attention-backend",
+            "aiter",
             "--chunked-prefill-size",
             "131072",
-            "--dp-size",
-            "8",
-            "--enable-dp-attention",
+            "--disable-radix-cache",
             "--mem-fraction-static",
             "0.85",
             "--trust-remote-code",
         ],
         env_vars={
-            "SGLANG_USE_ROCM700A": "1",
             "SGLANG_USE_AITER": "1",
         },
     ),
-]
-
-# Group 3b: DeepSeek-V3 with Torch Compile
-# Runner: nightly-amd-8-gpu-deepseek-v3-tc
-# Note: Uses torch compile for performance optimization, requires ROCm 7.0+
-AMD_DEEPSEEK_V3_TC_MODELS = [
-    # DeepSeek-V3-0324 with torch compile
+    # DeepSeek-R1-0528 with MTP (EAGLE speculative decoding)
     BaseModelConfig(
-        model_path="deepseek-ai/DeepSeek-V3-0324",
+        model_path="deepseek-ai/DeepSeek-R1-0528",
         tp_size=8,
         accuracy_threshold=0.93,
-        timeout=7200,  # 2 hours for compilation + large model
+        timeout=3600,
+        variant="MTP",
         other_args=[
             "--chunked-prefill-size",
             "131072",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "1",
+            "--speculative-num-draft-tokens",
+            "4",
             "--mem-fraction-static",
-            "0.70",  # Reduced further for torch compile
-            "--cuda-graph-max-bs",
-            "8",  # Reduced from 16 to reduce memory
-            "--enable-torch-compile",
-            "--disable-cuda-graph",  # Disable cuda graph to avoid memory issues
+            "0.7",
             "--trust-remote-code",
         ],
         env_vars={
-            "SGLANG_USE_ROCM700A": "1",
             "SGLANG_USE_AITER": "1",
         },
     ),
 ]
 
-# Group 3c: DeepSeek-V3 with MTP (EAGLE speculative decoding)
-# Runner: nightly-amd-8-gpu-deepseek-v3-mtp
-# Note: Uses MTP for improved throughput, requires ROCm 7.0+
-AMD_DEEPSEEK_V3_MTP_MODELS = [
-    # DeepSeek-V3-0324 with MTP (EAGLE speculative decoding)
+# Group 4: DeepSeek-R1 with DP + TC combined
+# Runner: nightly-amd-8-gpu-deepseek-r1-dp-tc
+# Combines DP attention and Torch Compile tests for DeepSeek-R1
+AMD_DEEPSEEK_R1_DP_TC_MODELS = [
+    # DeepSeek-R1-0528 with DP attention
     BaseModelConfig(
-        model_path="deepseek-ai/DeepSeek-V3-0324",
+        model_path="deepseek-ai/DeepSeek-R1-0528",
         tp_size=8,
         accuracy_threshold=0.93,
-        timeout=3600,  # 1 hour for large model
+        timeout=3600,
+        variant="DP",
         other_args=[
             "--chunked-prefill-size",
             "131072",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-num-steps",
-            "3",
-            "--speculative-eagle-topk",
-            "1",
-            "--speculative-num-draft-tokens",
-            "4",
+            "--dp-size",
+            "8",
+            "--enable-dp-attention",
             "--mem-fraction-static",
-            "0.7",
+            "0.85",
             "--trust-remote-code",
         ],
         env_vars={
             "SGLANG_USE_ROCM700A": "1",
             "SGLANG_USE_AITER": "1",
         },
     ),
-]
-
-# Group 4: DeepSeek-R1 (reasoning model)
-# Runner: nightly-amd-8-gpu-deepseek-r1
-AMD_DEEPSEEK_R1_MODELS = [
-    # DeepSeek-R1-0528 - reasoning model, ~80GB per GPU
+    # DeepSeek-R1-0528 with torch compile
     BaseModelConfig(
         model_path="deepseek-ai/DeepSeek-R1-0528",
         tp_size=8,
         accuracy_threshold=0.93,
-        timeout=3600,  # 1 hour for large model
+        timeout=7200,  # 2 hours for compilation
+        variant="TC",
         other_args=[
-            "--attention-backend",
-            "aiter",
             "--chunked-prefill-size",
             "131072",
-            "--disable-radix-cache",
             "--mem-fraction-static",
-            "0.85",
+            "0.70",
+            "--cuda-graph-max-bs",
+            "8",
+            "--enable-torch-compile",
+            "--disable-cuda-graph",
             "--trust-remote-code",
         ],
         env_vars={
+            "SGLANG_USE_ROCM700A": "1",
             "SGLANG_USE_AITER": "1",
         },
     ),
@@ -312,27 +321,28 @@ def get_model_group() -> str:
 
 
 def get_models_for_group(group: str) -> List[BaseModelConfig]:
-    """Get the list of models for a given group."""
+    """Get the list of models for a given group.
+
+    Note: DeepSeek-V3 is only used for perf tests, not accuracy tests.
+    See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py.
+    """
     if group == "gpt-oss":
         return AMD_GPT_OSS_MODELS
     elif group == "grok":
         return AMD_GROK_MODELS
-    elif group == "deepseek-v3-dp":
-        return AMD_DEEPSEEK_V3_DP_MODELS
-    elif group == "deepseek-v3-tc":
-        return AMD_DEEPSEEK_V3_TC_MODELS
-    elif group == "deepseek-v3-mtp":
-        return AMD_DEEPSEEK_V3_MTP_MODELS
     elif group == "deepseek-r1":
         return AMD_DEEPSEEK_R1_MODELS
+    elif group == "deepseek-r1-dp-tc":
+        return AMD_DEEPSEEK_R1_DP_TC_MODELS
+    elif group == "deepseek-r1-all":
+        # All DeepSeek-R1 variants: basic, MTP, DP, TC
+        return AMD_DEEPSEEK_R1_MODELS + AMD_DEEPSEEK_R1_DP_TC_MODELS
     elif group == "all":
         return (
             AMD_GPT_OSS_MODELS
             + AMD_GROK_MODELS
-            + AMD_DEEPSEEK_V3_DP_MODELS
-            + AMD_DEEPSEEK_V3_TC_MODELS
-            + AMD_DEEPSEEK_V3_MTP_MODELS
             + AMD_DEEPSEEK_R1_MODELS
+            + AMD_DEEPSEEK_R1_DP_TC_MODELS
         )
     else:
         print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'")
@@ -671,9 +681,10 @@ def test_gsm8k_completion_all_models(self):
         )
 
         for config in self.models:
-            with self.subTest(model=config.model_path):
+            display_name = config.get_display_name()
+            with self.subTest(model=display_name):
                 print(f"\n{'='*60}")
-                print(f"Testing: {config.model_path} (TP={config.tp_size})")
+                print(f"Testing: {display_name} (TP={config.tp_size})")
                 print(f"{'='*60}")
 
                 error_message = None
@@ -687,12 +698,12 @@ def test_gsm8k_completion_all_models(self):
 
                 if not is_available:
                     print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}")
-                    print(f"⏭️ SKIPPING: {config.model_path}")
+                    print(f"⏭️ SKIPPING: {display_name}")
                     status = f"⏭️ SKIP"
                     skipped = True
                     all_results.append(
                         {
-                            "model": config.model_path,
+                            "model": display_name,
                             "tp_size": config.tp_size,
                             "accuracy": None,
                             "threshold": config.accuracy_threshold,
@@ -709,7 +720,7 @@ def test_gsm8k_completion_all_models(self):
                 else:
                     try:
                         # Launch server with timing
-                        print(f"\n🚀 Launching server for {config.model_path}...")
+                        print(f"\n🚀 Launching server for {display_name}...")
                         server_start = time.time()
                         process = popen_launch_server_for_base_model(
                             self.base_url, config
@@ -747,7 +758,7 @@ def test_gsm8k_completion_all_models(self):
 
                             total_time = time.time() - model_start
 
-                            print(f"\n📈 Results for {config.model_path}:")
+                            print(f"\n📈 Results for {display_name}:")
                             print(
                                 f"   Accuracy: {acc:.3f} (threshold: {config.accuracy_threshold})"
                             )
@@ -768,7 +779,7 @@ def test_gsm8k_completion_all_models(self):
 
                             all_results.append(
                                 {
-                                    "model": config.model_path,
+                                    "model": display_name,
                                     "tp_size": config.tp_size,
                                     "accuracy": acc,
                                     "threshold": config.accuracy_threshold,
@@ -790,7 +801,7 @@ def test_gsm8k_completion_all_models(self):
                             status = "❌ ERROR"
                             all_results.append(
                                 {
-                                    "model": config.model_path,
+                                    "model": display_name,
                                     "tp_size": config.tp_size,
                                     "accuracy": None,
                                     "threshold": config.accuracy_threshold,
@@ -806,7 +817,7 @@ def test_gsm8k_completion_all_models(self):
                             )
 
                         finally:
-                            print(f"\n🛑 Stopping server for {config.model_path}...")
+                            print(f"\n🛑 Stopping server for {display_name}...")
                             kill_process_tree(process.pid)
 
                     except Exception as e:
@@ -816,7 +827,7 @@ def test_gsm8k_completion_all_models(self):
                         status = "❌ ERROR"
                         all_results.append(
                             {
-                                "model": config.model_path,
+                                "model": display_name,
                                 "tp_size": config.tp_size,
                                 "accuracy": None,
                                 "threshold": config.accuracy_threshold,
@@ -831,14 +842,14 @@ def test_gsm8k_completion_all_models(self):
                             }
                         )
 
-                # Add to summary with runtime
+                # Add to summary with runtime (use display name to show variant)
                 acc_str = f"{acc:.3f}" if acc is not None else "N/A"
                 startup_str = (
                     f"{startup_time:.0f}s" if startup_time is not None else "N/A"
                 )
                 bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A"
                 total_str = f"{total_time:.0f}s" if total_time is not None else "N/A"
-                summary += f"| {config.model_path} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n"
+                summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n"
 
         # Calculate total test runtime
         total_test_time = time.time() - total_test_start