"""MI35x Qwen3-Coder-Next GSM8K Completion Evaluation Test (8-GPU)

Tests Qwen3-Coder-Next model with basic and MTP configurations
using few-shot completion benchmark on MI35x.

Registry: nightly-amd-8-gpu-mi35x suite
"""

import ast
import os

# Set HF cache for MI35x (must happen before any HF-dependent import reads it)
os.environ.setdefault("HF_HOME", "/data2/models/huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub")

import re
import time
import unittest
from dataclasses import dataclass
from typing import List, Optional, Tuple

import numpy as np

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)
from sglang.utils import download_and_cache_file, read_jsonl

# Register for AMD CI - MI35x Qwen3-Coder-Next accuracy test
register_amd_ci(est_time=3600, suite="nightly-amd-8-gpu-mi35x", nightly=True)

# Sentinel for "no numeric answer could be extracted".
INVALID = -9999999

# Model path configuration for MI35x Qwen3-Coder-Next
# Priority: 1) env var, 2) local path
QWEN3_CODER_NEXT_LOCAL_PATH = "/data/Qwen/Qwen3-Coder-Next/"
QWEN3_CODER_NEXT_HF_MODEL_ID = "Qwen/Qwen3-Coder-Next"


def get_model_path() -> str:
    """Get effective model path: env var > local path > HF model ID."""
    env_path = os.environ.get("QWEN3_CODER_NEXT_MODEL_PATH")
    if env_path:
        return env_path
    if os.path.exists(QWEN3_CODER_NEXT_LOCAL_PATH):
        return QWEN3_CODER_NEXT_LOCAL_PATH
    return QWEN3_CODER_NEXT_HF_MODEL_ID


@dataclass
class ModelConfig:
    """Configuration for a model to test.

    Attributes:
        model_path: HF model ID or local checkpoint path.
        tp_size: tensor-parallel degree passed as ``--tp``.
        accuracy_threshold: minimum GSM8K accuracy to pass.
        other_args: extra server CLI args (defaults to empty list).
        env_vars: extra environment variables for the server process.
        timeout: server launch timeout in seconds (None -> default).
        variant: human-readable variant tag, e.g. "basic" or "mtp".
    """

    model_path: str
    tp_size: int = 8
    accuracy_threshold: float = 0.50
    other_args: Optional[List[str]] = None
    env_vars: Optional[dict] = None
    timeout: Optional[int] = None
    variant: Optional[str] = None

    def __post_init__(self):
        # Replace None defaults with fresh mutable containers
        # (mutable defaults on the dataclass itself would be shared).
        if self.other_args is None:
            self.other_args = []
        if self.env_vars is None:
            self.env_vars = {}

    def get_display_name(self) -> str:
        """Return "model (variant)" when a variant tag is set."""
        if self.variant:
            return f"{self.model_path} ({self.variant})"
        return self.model_path


def get_qwen3_coder_next_models() -> List[ModelConfig]:
    """Get Qwen3-Coder-Next model configurations for MI35x."""
    model_path = get_model_path()
    common_kwargs = {
        "model_path": model_path,
        "tp_size": 8,
        "accuracy_threshold": 0.90,
        "timeout": 3600,
    }
    common_args = [
        "--attention-backend",
        "aiter",
        "--chunked-prefill-size",
        "131072",
        "--disable-radix-cache",
        "--mem-fraction-static",
        "0.8",
        "--trust-remote-code",
    ]
    return [
        # Basic — matches run_qwen3-coder-next_spec.sh
        ModelConfig(
            **common_kwargs,
            variant="basic",
            other_args=common_args
            + [
                "--kv-cache-dtype",
                "fp8_e4m3",
            ],
        ),
        # MTP (speculative decoding)
        # TODO: Support MTP with fp8 kv cache on gfx950.
        # Note: no --kv-cache-dtype fp8_e4m3 because Triton extend_attention
        # used by MTP does not support fp8 kv cache on gfx950.
        ModelConfig(
            **common_kwargs,
            variant="mtp",
            other_args=common_args
            + [
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-num-steps",
                "3",
                "--speculative-eagle-topk",
                "1",
                "--speculative-num-draft-tokens",
                "4",
            ],
        ),
    ]


def get_one_example(lines, i, include_answer):
    """Format a single GSM8K example as a completion prompt.

    Args:
        lines: parsed JSONL rows with "question"/"answer" keys.
        i: row index.
        include_answer: if True, append the gold answer (few-shot exemplar).
    """
    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        ret += " " + lines[i]["answer"]
    return ret


def get_few_shot_examples(lines, k):
    """Concatenate the first k examples (with answers) for prompting."""
    ret = ""
    for i in range(k):
        ret += get_one_example(lines, i, True) + "\n\n"
    return ret


def get_answer_value(answer_str):
    """Extract the last numerical answer from a response string.

    Returns INVALID when no number is present or the literal cannot be
    parsed (e.g. leading zeros make literal_eval raise SyntaxError).
    """
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if not numbers:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except (SyntaxError, ValueError):
        # literal_eval raises SyntaxError for malformed source (e.g. "007")
        # and ValueError for non-literal input; treat both as unparsable.
        return INVALID


def run_gsm8k_benchmark(
    base_url: str,
    num_questions: int = 200,
    num_shots: int = 5,
    parallel: int = 64,
) -> Tuple[float, float, float]:
    """Run GSM8K few-shot completion benchmark.

    Args:
        base_url: URL of the running sglang server.
        num_questions: number of test questions to evaluate.
        num_shots: number of few-shot exemplars prepended to each prompt.
        parallel: number of concurrent request threads.

    Returns:
        (accuracy, invalid_fraction, latency_seconds)
    """
    import sglang as sgl
    from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
    data_path = download_and_cache_file(url)
    lines = list(read_jsonl(data_path))

    few_shot_examples = get_few_shot_examples(lines, num_shots)

    # Cap at dataset size without materializing a throwaway slice.
    num_questions = min(num_questions, len(lines))
    questions = []
    labels = []
    for i in range(num_questions):
        questions.append(get_one_example(lines, i, False))
        labels.append(get_answer_value(lines[i]["answer"]))
    assert all(label != INVALID for label in labels)
    arguments = [{"question": q} for q in questions]

    @sgl.function
    def few_shot_gsm8k(s, question):
        s += few_shot_examples + question
        s += sgl.gen(
            "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
        )

    backend = RuntimeEndpoint(base_url)
    sgl.set_default_backend(backend)

    tic = time.perf_counter()
    states = few_shot_gsm8k.run_batch(
        arguments, temperature=0, num_threads=parallel, progress_bar=True
    )
    latency = time.perf_counter() - tic

    preds = [get_answer_value(states[i]["answer"]) for i in range(len(states))]
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    return float(acc), float(invalid), float(latency)


class TestQwen3CoderNextEvalMI35x(unittest.TestCase):
    """Qwen3-Coder-Next GSM8K Completion Evaluation Test for AMD MI35x."""

    @classmethod
    def setUpClass(cls):
        cls.models = get_qwen3_coder_next_models()
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200"))

    def test_qwen3_coder_next_accuracy(self):
        """Test Qwen3-Coder-Next models with GSM8K completion benchmark."""
        # Check if model exists
        model_path = get_model_path()
        is_local_path = model_path.startswith("/")
        if is_local_path and not os.path.exists(model_path):
            print(f"\nSKIPPING: Local model not found at {model_path}")
            # skipTest raises SkipTest, so nothing after it runs.
            self.skipTest(f"Local model not found at {model_path}")

        if is_local_path:
            print(f"Using local model: {model_path}")
        else:
            print(f"Using HuggingFace model: {model_path}")

        all_results = []
        summary = "### Qwen3-Coder-Next Models (MI35x)\n\n"
        summary += "| Model | Variant | TP | Accuracy | Threshold | Status |\n"
        summary += "| ----- | ------- | -- | -------- | --------- | ------ |\n"

        for config in self.models:
            display_name = config.get_display_name()
            with self.subTest(model=display_name):
                print(f"\n{'='*60}")
                print(f"Testing: {display_name}")
                print(f"{'='*60}")

                env = os.environ.copy()
                for key, value in config.env_vars.items():
                    env[key] = value

                other_args = list(config.other_args)
                other_args.extend(["--tp", str(config.tp_size)])
                timeout = config.timeout or DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH

                try:
                    process = popen_launch_server(
                        model=config.model_path,
                        base_url=self.base_url,
                        timeout=timeout,
                        other_args=other_args,
                        env=env,
                    )

                    try:
                        acc, invalid, latency = run_gsm8k_benchmark(
                            self.base_url, num_questions=self.num_questions
                        )
                        passed = acc >= config.accuracy_threshold
                        status = "PASS" if passed else "FAIL"
                        print(
                            f" accuracy={acc:.3f} threshold={config.accuracy_threshold} {status}"
                        )

                        all_results.append(
                            {
                                "model": display_name,
                                "accuracy": acc,
                                "passed": passed,
                            }
                        )
                        summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | {acc:.3f} | {config.accuracy_threshold} | {status} |\n"

                    finally:
                        # Always tear the server down, even on benchmark failure.
                        kill_process_tree(process.pid)

                except Exception as e:
                    # Record the failure and keep evaluating the other variants.
                    summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | N/A | {config.accuracy_threshold} | ERROR |\n"
                    all_results.append(
                        {
                            "model": display_name,
                            "accuracy": None,
                            "passed": False,
                            "error": str(e),
                        }
                    )

        if is_in_ci():
            write_github_step_summary(summary)

        failed = [r for r in all_results if not r["passed"]]
        if failed:
            raise AssertionError(f"Failed models: {[r['model'] for r in failed]}")


if __name__ == "__main__":
    unittest.main()
"""MI35x Qwen3-Coder-Next Functionality Test (8-GPU)

Tests Qwen3-Coder-Next model with basic configuration
on MI35x. Covers GSM8K accuracy and BS=1 decode speed.

Server args match run_qwen3-coder-next_spec.sh.

Registry: stage-c-test-large-8-gpu-amd-mi35x suite
"""

import unittest
from types import SimpleNamespace

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.send_one import BenchArgs, send_one_prompt
from sglang.test.test_utils import (
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

register_amd_ci(est_time=3600, suite="stage-c-test-large-8-gpu-amd-mi35x")

QWEN3_CODER_NEXT_MODEL_PATH = "Qwen/Qwen3-Coder-Next"
SERVER_LAUNCH_TIMEOUT = 1800

# Server args shared by the basic and MTP configurations below.
COMMON_ARGS = [
    "--tp",
    "8",
    "--attention-backend",
    "aiter",
    "--chunked-prefill-size",
    "131072",
    "--disable-radix-cache",
    "--mem-fraction-static",
    "0.8",
    "--trust-remote-code",
]


class TestQwen3CoderNext(CustomTestCase):
    """Basic (non-speculative) configuration with fp8 kv cache."""

    @classmethod
    def setUpClass(cls):
        cls.model = QWEN3_CODER_NEXT_MODEL_PATH
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = COMMON_ARGS + [
            "--kv-cache-dtype",
            "fp8_e4m3",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=SERVER_LAUNCH_TIMEOUT,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_a_gsm8k(self):
        """GSM8K few-shot accuracy (runs first to warm up server)."""
        requests.get(self.base_url + "/flush_cache")

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            parallel=128,
            max_new_tokens=512,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_gsm8k (qwen3-coder-next)\n" f'{metrics["accuracy"]=:.3f}\n'
            )
        self.assertGreater(metrics["accuracy"], 0.90)

    def test_bs_1_speed(self):
        """Batch-size 1 decode speed."""
        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
        _, speed = send_one_prompt(args)

        print(f"{speed=:.2f}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_bs_1_speed (qwen3-coder-next)\n" f"{speed=:.2f} token/s\n"
            )
        # Speed threshold intentionally disabled until a baseline is established.
        # self.assertGreater(speed, 50)


@unittest.skip("MTP perf not ready yet — Triton extend_attention fp8 kv cache TODO")
class TestQwen3CoderNextMTP(CustomTestCase):
    """MTP (EAGLE speculative decoding) configuration — currently skipped."""

    @classmethod
    def setUpClass(cls):
        cls.model = QWEN3_CODER_NEXT_MODEL_PATH
        cls.base_url = DEFAULT_URL_FOR_TEST
        # TODO: Support MTP with fp8 kv cache on gfx950.
        # Note: no --kv-cache-dtype fp8_e4m3 because Triton extend_attention
        # used by MTP does not support fp8 kv cache on gfx950.
        other_args = COMMON_ARGS + [
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "3",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=SERVER_LAUNCH_TIMEOUT,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_a_gsm8k(self):
        """GSM8K few-shot accuracy with MTP (runs first to warm up server)."""
        requests.get(self.base_url + "/flush_cache")

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        server_info = requests.get(self.base_url + "/get_server_info")
        avg_spec_accept_length = server_info.json()["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_gsm8k (qwen3-coder-next mtp)\n"
                f'{metrics["accuracy"]=:.3f}\n'
                f"{avg_spec_accept_length=:.2f}\n"
            )
        self.assertGreater(metrics["accuracy"], 0.90)
        self.assertGreater(avg_spec_accept_length, 2.0)

    def test_bs_1_speed(self):
        """Batch-size 1 decode speed with MTP."""
        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
        acc_length, speed = send_one_prompt(args)

        print(f"{acc_length=:.2f} {speed=:.2f}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_bs_1_speed (qwen3-coder-next mtp)\n"
                f"{acc_length=:.2f}\n"
                f"{speed=:.2f} token/s\n"
            )
        # Thresholds intentionally disabled until MTP perf is ready (see skip above).
        # self.assertGreater(acc_length, 2.0)
        # self.assertGreater(speed, 100)


if __name__ == "__main__":
    unittest.main()