Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 40 additions & 5 deletions .github/workflows/nightly-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ on:
- 'nightly-8-gpu-kimi-k2'
# MI35x jobs
- 'nightly-test-1-gpu-mi35x'
- 'nightly-8-gpu-mi35x-kimi-k2'
- 'nightly-accuracy-8-gpu-mi35x'
- 'nightly-8-gpu-mi35x-grok1-int4'
- 'nightly-8-gpu-mi35x-grok2'
Expand Down Expand Up @@ -582,13 +583,13 @@ jobs:
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

- name: Accuracy Test MI35x (8-GPU Grok1-INT4)
timeout-minutes: 60
timeout-minutes: 90
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e RCCL_MSCCL_ENABLE=0 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

Expand Down Expand Up @@ -793,6 +794,39 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Kimi-K2 (Accuracy)
nightly-8-gpu-mi35x-kimi-k2:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k2')
runs-on: linux-mi35x-gpu-8
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}

- name: Setup docker
run: |
touch github_summary.md
bash scripts/ci/amd/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh
# Install tabulate for run_suite.py (missing in MI35x container)
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

- name: Accuracy Test MI35x (8-GPU Kimi-K2)
timeout-minutes: 180
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP)
nightly-perf-8-gpu-mi35x-deepseek-v32-mtp:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp')
Expand Down Expand Up @@ -834,10 +868,10 @@ jobs:
# MI30x Accuracy Tests
- nightly-accuracy-2-gpu
- nightly-accuracy-2-gpu-vlm
# MI30x Performance Tests
- nightly-perf-2-gpu-text
- nightly-perf-2-gpu-vlm
- nightly-accuracy-8-gpu
# MI30x Performance Tests - excluded from check (perf failures don't block CI)
# - nightly-perf-2-gpu-text
# - nightly-perf-2-gpu-vlm
# MI30x Combined Accuracy + Performance Tests
- nightly-8-gpu-grok1-int4
- nightly-8-gpu-grok2
Expand All @@ -853,6 +887,7 @@ jobs:
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4
- nightly-accuracy-8-gpu-mi35x-deepseek-v32
- nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp
- nightly-8-gpu-mi35x-kimi-k2
# MI35x perf jobs excluded from check - perf failures don't block CI
# - nightly-perf-8-gpu-mi35x-deepseek-v32-basic
# - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp
Expand Down
20 changes: 16 additions & 4 deletions python/sglang/test/nightly_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def build_benchmark_command(
json_output_file: str,
extra_args: Optional[List[str]] = None,
server_args: Optional[List[str]] = None,
enable_profile: bool = True,
) -> List[str]:
"""Build the benchmark command with all required arguments.

Expand All @@ -106,6 +107,7 @@ def build_benchmark_command(
json_output_file: Path to JSON output file
extra_args: Optional extra arguments to append to command
server_args: Optional server launch arguments to record in metrics
enable_profile: Whether to enable profiling (default True for NVIDIA)

Returns:
List of command arguments ready for subprocess.run()
Expand All @@ -125,15 +127,22 @@ def build_benchmark_command(
"--output-len",
*[str(x) for x in output_lens],
"--show-report",
"--profile",
"--profile-by-stage",
"--profile-output-dir",
profile_path_prefix,
f"--pydantic-result-filename={json_output_file}",
"--no-append-to-github-summary",
"--trust-remote-code",
]

# Add profiling flags only if enabled (disabled for AMD tests)
if enable_profile and profile_path_prefix:
command.extend(
[
"--profile",
"--profile-by-stage",
"--profile-output-dir",
profile_path_prefix,
]
)

if extra_args:
command.extend(extra_args)

Expand Down Expand Up @@ -218,6 +227,7 @@ def run_benchmark_for_model(
other_args: Optional[List[str]] = None,
variant: str = "",
extra_bench_args: Optional[List[str]] = None,
enable_profile: bool = True,
) -> Tuple[List[BenchmarkResult], bool, Optional[float]]:
"""Run a complete benchmark for a single model with server management.

Expand All @@ -236,6 +246,7 @@ def run_benchmark_for_model(
other_args: Arguments to pass to server launch
variant: Optional variant suffix (e.g., "basic", "mtp")
extra_bench_args: Extra arguments for the benchmark command
enable_profile: Whether to enable profiling (default True for NVIDIA)

Returns:
Tuple of (list of BenchmarkResult objects, success_bool, avg_spec_accept_length or None)
Expand Down Expand Up @@ -273,6 +284,7 @@ def run_benchmark_for_model(
json_output_file,
extra_args=bench_args,
server_args=other_args,
enable_profile=enable_profile,
)

result, cmd_success = self.run_benchmark_command(command, model_description)
Expand Down
8 changes: 6 additions & 2 deletions test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ def __post_init__(self):
"triton",
"--trust-remote-code",
],
env_vars={"SGLANG_USE_AITER": "1"},
env_vars={
"SGLANG_USE_AITER": "0"
}, # Disabled due to SWA eviction bug with aiter (#17220)
),
ModelConfig(
model_path="openai/gpt-oss-120b",
Expand All @@ -93,7 +95,9 @@ def __post_init__(self):
"triton",
"--trust-remote-code",
],
env_vars={"SGLANG_USE_AITER": "1"},
env_vars={
"SGLANG_USE_AITER": "0"
}, # Disabled due to SWA eviction bug with aiter (#17220)
),
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
)
from sglang.utils import download_and_cache_file, read_jsonl

# Register for AMD CI - GROK1-INT4 accuracy tests on MI35x (~25 min)
# Register for AMD CI - GROK1-INT4 accuracy tests on MI35x (~70 min)
register_amd_ci(
est_time=1500, suite="nightly-amd-accuracy-8-gpu-mi35x-grok1-int4", nightly=True
est_time=4200, suite="nightly-amd-accuracy-8-gpu-mi35x-grok1-int4", nightly=True
)

INVALID = -9999999
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class TestGrok2EvalMI35x(unittest.TestCase):
def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST
cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200"))
cls.accuracy_threshold = 0.915
cls.accuracy_threshold = 0.90

def test_grok2_accuracy(self):
"""Test Grok-2 with GSM8K completion benchmark."""
Expand Down
105 changes: 105 additions & 0 deletions test/registered/amd/accuracy/mi35x/test_kimi_k2_eval_mi35x.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""MI35x Kimi-K2 GSM8K Completion Evaluation Test (8-GPU)

Tests moonshotai/Kimi-K2-Instruct-0905 with GSM8K few-shot benchmark on MI35x.

Registry: nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 suite
"""

import os
import unittest
from types import SimpleNamespace

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)

# Register for AMD CI - Kimi K2 accuracy test on MI35x (~60 min)
# est_time is a scheduling estimate in seconds; the suite name must match the
# --suite value used by run_suite.py in the nightly workflow.
register_amd_ci(
    est_time=3600, suite="nightly-amd-accuracy-8-gpu-mi35x-kimi-k2", nightly=True
)

# Model under test and its launch/accuracy parameters.
KIMI_K2_MODEL_PATH = "moonshotai/Kimi-K2-Instruct-0905"
SERVER_LAUNCH_TIMEOUT = 3600  # seconds to wait for server startup (large model load)
ACCURACY_THRESHOLD = 0.94  # minimum acceptable GSM8K accuracy


class TestKimiK2EvalMI35x(CustomTestCase):
    """Kimi-K2 GSM8K Completion Evaluation Test for AMD MI35x.

    Launches an sglang server for Kimi-K2 with TP=8 and AMD-specific
    attention backends, runs the few-shot GSM8K benchmark against it, and
    asserts that accuracy meets ACCURACY_THRESHOLD. In CI, a markdown
    results table is appended to the GitHub step summary.
    """

    @classmethod
    def setUpClass(cls):
        # Shared endpoint for all tests in this class.
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_kimi_k2_gsm8k_accuracy(self):
        """Test Kimi-K2 with GSM8K few-shot completion benchmark."""
        other_args = [
            "--tp",
            "8",
            # Mixed backends: triton for decode, aiter for prefill.
            "--decode-attention-backend",
            "triton",
            "--prefill-attention-backend",
            "aiter",
            "--trust-remote-code",
            "--model-loader-extra-config",
            '{"enable_multithread_load": true}',
            # Generous watchdog: large-model load/warmup can stall briefly.
            "--watchdog-timeout",
            "1200",
        ]
        env = os.environ.copy()
        env["SGLANG_USE_AITER"] = "1"
        env["SGLANG_ROCM_FUSED_DECODE_MLA"] = "0"

        process = popen_launch_server(
            KIMI_K2_MODEL_PATH,
            self.base_url,
            timeout=SERVER_LAUNCH_TIMEOUT,
            other_args=other_args,
            env=env,
        )

        try:
            # Flush the server cache so the eval starts from a clean state.
            # Fix: a timeout is required here — requests.get with no timeout
            # blocks forever if the server wedges, hanging the whole CI job
            # instead of failing it.
            requests.get(self.base_url + "/flush_cache", timeout=120)

            args = SimpleNamespace(
                num_shots=8,
                data_path=None,  # None -> use the default GSM8K dataset
                num_questions=1319,
                parallel=1319,  # submit all questions concurrently
                max_new_tokens=512,
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
            metrics = run_eval_few_shot_gsm8k(args)
            acc = metrics["accuracy"]

            passed = acc >= ACCURACY_THRESHOLD
            status = "✅ PASS" if passed else "❌ FAIL"
            print(f" accuracy={acc:.3f} threshold={ACCURACY_THRESHOLD} {status}")

            if is_in_ci():
                # Publish a markdown results table to the GitHub step summary.
                summary = "### Kimi-K2 Model (MI35x)\n\n"
                summary += "| Model | TP | Accuracy | Threshold | Status |\n"
                summary += "| ----- | -- | -------- | --------- | ------ |\n"
                summary += f"| {KIMI_K2_MODEL_PATH} | 8 | {acc:.3f} | {ACCURACY_THRESHOLD} | {status} |\n"
                write_github_step_summary(summary)

            self.assertGreaterEqual(
                acc,
                ACCURACY_THRESHOLD,
                f"Kimi-K2 accuracy {acc:.3f} below threshold {ACCURACY_THRESHOLD}",
            )
        finally:
            # Always tear down the server process tree, even on failure.
            kill_process_tree(process.pid)


# Allow running this test file directly (outside the CI suite runner).
if __name__ == "__main__":
    unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def test_bench_grok1_fp8(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_bench_grok1_int4(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def test_bench_grok2(self):
other_args=self.model_config["other_args"],
variant=self.model_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def test_bench_one_batch(self):
input_lens=self.input_lens,
output_lens=self.output_lens,
other_args=other_args,
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def test_bench_one_batch(self):
output_lens=self.output_lens,
other_args=other_args,
extra_bench_args=extra_bench_args,
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def test_bench_one_batch(self):
other_args=variant_config["other_args"],
variant=variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def test_bench_one_batch(self):
other_args=self.variant_config["other_args"],
variant=self.variant_config["name"],
extra_bench_args=["--trust-remote-code"],
enable_profile=False, # Disable profiling for AMD tests
)
results = result_tuple[0]
success = result_tuple[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def _run_benchmark_with_timeout(
profile_path_prefix,
json_output_file,
extra_args=bench_args,
enable_profile=False, # Disable profiling for AMD tests
)
_, cmd_success = runner.run_benchmark_command(command, model_description)
if not cmd_success:
Expand Down
Loading
Loading