diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 1c3a3cc1c08d..432a1f1e9921 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -657,7 +657,7 @@ jobs:
       fail-fast: false
       matrix:
         runner: [linux-mi35x-gpu-8]
-        part: [0, 1]
+        part: [0, 1, 2]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -679,7 +679,7 @@ jobs:
       - name: Run test
         timeout-minutes: 60
         run: |
-          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 3600
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
 
   stage-b-test-small-1-gpu-performance-amd:
     needs: [check-changes, call-gate, stage-a-test-1-amd]
diff --git a/test/registered/amd/test_kimi_k2_instruct.py b/test/registered/amd/test_kimi_k2_instruct.py
new file mode 100644
index 000000000000..34761396ef16
--- /dev/null
+++ b/test/registered/amd/test_kimi_k2_instruct.py
@@ -0,0 +1,111 @@
+"""Accuracy and single-request speed checks for Kimi-K2-Instruct-0905 on AMD CI."""
+
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_amd_ci
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.send_one import BenchArgs, send_one_prompt
+from sglang.test.test_utils import (
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+register_amd_ci(est_time=3600, suite="stage-c-test-large-8-gpu-amd-mi35x")
+
+KIMI_K2_MODEL_PATH = "moonshotai/Kimi-K2-Instruct-0905"
+SERVER_LAUNCH_TIMEOUT = 3600
+# Upper bound (seconds) on the cache-flush request so a stuck server fails the
+# test promptly instead of blocking until the CI step itself times out.
+FLUSH_CACHE_TIMEOUT = 60
+
+
+class TestKimiK2Instruct0905(CustomTestCase):
+    """Launch one Kimi-K2-Instruct-0905 server (TP=8) and test against it."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Start the server process shared by every test in this class."""
+        cls.model = KIMI_K2_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # Both tests need the port as an int, so parse it once here.
+        cls.port = int(cls.base_url.split(":")[-1])
+        other_args = [
+            "--tp",
+            "8",
+            "--decode-attention-backend",
+            "triton",
+            "--prefill-attention-backend",
+            "aiter",
+            "--trust-remote-code",
+            "--model-loader-extra-config",
+            '{"enable_multithread_load": true}',
+        ]
+        # Copy the environment so the overrides only reach the server process.
+        env = os.environ.copy()
+        env["SGLANG_USE_AITER"] = "1"
+        env["SGLANG_ROCM_FUSED_DECODE_MLA"] = "0"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=SERVER_LAUNCH_TIMEOUT,
+            other_args=other_args,
+            env=env,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the server process tree started in setUpClass."""
+        kill_process_tree(cls.process.pid)
+
+    def test_a_gsm8k(self):
+        """Run 8-shot GSM8K and require accuracy above 0.94.
+
+        The "a" in the name makes this test run first (alphabetically) so that
+        it warms up the server before the speed test.
+        """
+        requests.get(self.base_url + "/flush_cache", timeout=FLUSH_CACHE_TIMEOUT)
+
+        args = SimpleNamespace(
+            num_shots=8,
+            data_path=None,
+            num_questions=1319,
+            parallel=1319,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=self.port,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (Kimi-K2-Instruct-0905)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+            )
+        self.assertGreater(metrics["accuracy"], 0.94)
+
+    def test_bs_1_speed(self):
+        """Send one prompt and require a speed above 45 (reported as token/s)."""
+        args = BenchArgs(port=self.port, max_new_tokens=2048)
+        _, speed = send_one_prompt(args)
+
+        print(f"{speed=:.2f}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs_1_speed (Kimi-K2-Instruct-0905)\n"
+                f"{speed=:.2f} token/s\n"
+            )
+        self.assertGreater(speed, 45)
+
+
+if __name__ == "__main__":
+    unittest.main()