diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index f6ca9026ef0a..f8dbcca85713 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -458,7 +458,7 @@ class Envs: SGLANG_ROPE_CACHE_ALIGN = EnvInt(128) # Overlap Spec V2 - SGLANG_ENABLE_SPEC_V2 = EnvBool(False) + SGLANG_ENABLE_SPEC_V2 = EnvBool(True) SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False) # Spec Config diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 817ae43fbed2..9cbe98987135 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1962,11 +1962,6 @@ def _handle_model_specific_adjustments(self): logger.info( "Enable multi-layer EAGLE speculative decoding for MiMoV2 model." ) - if not envs.SGLANG_ENABLE_SPEC_V2.get(): - envs.SGLANG_ENABLE_SPEC_V2.set(True) - logger.warning( - "Spec v2 is enabled for multi-layer EAGLE speculative decoding." - ) if self.enable_hierarchical_cache: self.swa_full_tokens_ratio = 1.0 @@ -1983,11 +1978,6 @@ def _handle_model_specific_adjustments(self): logger.info( "Enable multi-layer EAGLE speculative decoding for Step3p5ForCausalLM model." ) - if not envs.SGLANG_ENABLE_SPEC_V2.get(): - envs.SGLANG_ENABLE_SPEC_V2.set(True) - logger.warning( - "Spec v2 is enabled for multi-layer EAGLE speculative decoding." - ) if self.enable_hierarchical_cache: self.swa_full_tokens_ratio = 1.0 logger.warning( @@ -3386,26 +3376,29 @@ def _handle_speculative_decoding(self): "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests." ) + spec_v1_reason = None if ( - self.speculative_algorithm in ["EAGLE", "EAGLE3", "STANDALONE"] - and envs.SGLANG_ENABLE_SPEC_V2.get() + self.speculative_eagle_topk is not None + and self.speculative_eagle_topk > 1 + and not self.disable_overlap_schedule + ): + self.disable_overlap_schedule = True + spec_v1_reason = "spec v2 currently only supports topk = 1" + elif ( + not envs.SGLANG_ENABLE_SPEC_V2.get() + and not self.disable_overlap_schedule ): - self.disable_overlap_schedule = False + self.disable_overlap_schedule = True + spec_v1_reason = "SGLANG_ENABLE_SPEC_V2=False" + + if self.disable_overlap_schedule: logger.warning( - "Spec v2 is enabled for eagle/eagle3 speculative decoding and overlap schedule is turned on." + "Spec v1 is used for eagle/eagle3/standalone speculative decoding because %s.", + spec_v1_reason or "overlap schedule is disabled", ) - if ( - self.speculative_eagle_topk is not None - and self.speculative_eagle_topk > 1 - ): - raise ValueError( - "Spec v2 currently only supports topk = 1 for speculative decoding." - ) else: - self.disable_overlap_schedule = True logger.warning( - "Overlap scheduler is disabled when spec v2 is off or using unsupported speculative algorithm. " - "You can set env SGLANG_ENABLE_SPEC_V2=True to enable the experimental overlap scheduler. " + "Spec v2 is enabled by default for eagle/eagle3/standalone speculative decoding." ) if self.enable_mixed_chunk: diff --git a/test/manual/ascend/test_ascend_deepseek_mtp.py b/test/manual/ascend/test_ascend_deepseek_mtp.py index cbe01a07add1..acc78fa5b44b 100644 --- a/test/manual/ascend/test_ascend_deepseek_mtp.py +++ b/test/manual/ascend/test_ascend_deepseek_mtp.py @@ -53,7 +53,6 @@ def setUpClass(cls): ] envs.SGLANG_NPU_USE_MLAPO.set(True) - envs.SGLANG_ENABLE_SPEC_V2.set(True) envs.SGLANG_ENABLE_OVERLAP_PLAN_STREAM.set(True) def test_a_gsm8k(self): diff --git a/test/manual/test_deepseek_v31.py b/test/manual/test_deepseek_v31.py index 8025e47c4e2b..543879b17de2 100644 --- a/test/manual/test_deepseek_v31.py +++ b/test/manual/test_deepseek_v31.py @@ -50,7 +50,6 @@ def test_deepseek_v31_all_variants(self): DEEPSEEK_V31_MODEL_PATH, tp_size=8, extra_args=base_args + mtp_args, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="TP8+MTP", ), ] diff --git a/test/manual/test_glm_46_fp8.py b/test/manual/test_glm_46_fp8.py index 94fb724b75b8..815ad33f4d53 100644 --- a/test/manual/test_glm_46_fp8.py +++ b/test/manual/test_glm_46_fp8.py @@ -41,7 +41,6 @@ def test_glm_46_fp8_all_variants(self): GLM_4_6_FP8_MODEL_PATH, tp_size=8, extra_args=base_args + mtp_args, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="TP8+MTP", ), ] diff --git a/test/manual/test_qwen3_235b.py b/test/manual/test_qwen3_235b.py index f0e4f03996ce..acae0bd1e182 100644 --- a/test/manual/test_qwen3_235b.py +++ b/test/manual/test_qwen3_235b.py @@ -52,7 +52,6 @@ def test_qwen3_235b_fp8_all_variants(self): QWEN3_235B_FP8_MODEL_PATH, tp_size=8, extra_args=base_args + eagle3_args, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="TP8+EP2+EAGLE3", ), ] diff --git a/test/registered/4-gpu-models/test_qwen35_models.py b/test/registered/4-gpu-models/test_qwen35_models.py new file mode 100644 index 000000000000..be125c32175f --- /dev/null +++ b/test/registered/4-gpu-models/test_qwen35_models.py @@ -0,0 +1,242 @@ +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.accuracy_test_runner import AccuracyTestParams +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.kits.reasoning_kit import ReasoningTokenUsageMixin + +# This eval harness applies the chat_template, which is critical for qwen3.5 +# to get good accuracy on gsm8k +from sglang.test.run_combined_tests import run_combined_tests +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + ModelLaunchSettings, + popen_launch_server, +) + +register_cuda_ci(est_time=768, suite="stage-c-test-4-gpu-b200") + +QWEN35_FP4_MODEL = "nvidia/Qwen3.5-397B-A17B-NVFP4" +ACC_THRESHOLDS = {QWEN35_FP4_MODEL: {"gsm8k": 0.95}} + + +class TestQwen35FP4(CustomTestCase): + def test_gsm8k(self): + base_args = [ + "--tp-size", + "4", + "--chunked-prefill-size", + "2048", + "--mamba-scheduler-strategy", + "extra_buffer", + "--mamba-track-interval", + "128", + "--mamba-ssm-dtype", + "bfloat16", + "--max-running-requests", + "128", + "--reasoning-parser", + "qwen3", + "--attention-backend", + "trtllm_mha", + "--quantization", + "modelopt_fp4", + "--model-loader-extra-config", + '{"enable_multithread_load": true,"num_threads": 64}', + ] + + variants = [ + ModelLaunchSettings( + QWEN35_FP4_MODEL, + extra_args=base_args, + variant="Triton", + ), + # TODO: Fix this and re-enable it + # ModelLaunchSettings( + # QWEN35_FP4_MODEL, + # extra_args=base_args + ["--linear-attn-decode-backend", "flashinfer"], + # variant="FlashInfer", + # ), + ] + + run_combined_tests( + models=variants, + test_name="Qwen3.5-397B-A17B-NVFP4", + accuracy_params=AccuracyTestParams( + dataset="gsm8k", + baseline_accuracy=ACC_THRESHOLDS[QWEN35_FP4_MODEL]["gsm8k"], + num_examples=200, + num_threads=128, + max_tokens=16000, + thinking_mode="qwen3", + temperature=0.6, + top_p=0.95, + top_k=20, + ), + ) + + +class TestQwen35FP4MTP(ReasoningTokenUsageMixin, CustomTestCase): + reasoning_parser_name = "qwen3" + + @classmethod + def setUpClass(cls): + cls.model = QWEN35_FP4_MODEL + cls.base_url = DEFAULT_URL_FOR_TEST + cls.init_reasoning_token_verifier() + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--tp-size", + "4", + "--chunked-prefill-size", + "2048", + "--mamba-scheduler-strategy", + "extra_buffer", + "--mamba-track-interval", + "128", + "--mamba-ssm-dtype", + "bfloat16", + "--max-running-requests", + "128", + "--reasoning-parser", + "qwen3", + "--attention-backend", + "trtllm_mha", + "--quantization", + "modelopt_fp4", + "--speculative-algorithm", + "NEXTN", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true,"num_threads": 64}', + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + model=self.model, + eval_name="gsm8k", + num_shots=5, + num_examples=200, + max_tokens=16000, + num_threads=128, + repeat=1, + temperature=0.6, + top_p=0.95, + top_k=20, + base_url=self.base_url, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval(args) + print(f"{metrics=}") + self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"]) + + server_info = requests.get(self.base_url + "/server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print(f"{avg_spec_accept_length=}") + self.assertGreater(avg_spec_accept_length, 3.3) + + +class TestQwen35FP4MTPV2(ReasoningTokenUsageMixin, CustomTestCase): + reasoning_parser_name = "qwen3" + + @classmethod + def setUpClass(cls): + cls.model = QWEN35_FP4_MODEL + cls.base_url = DEFAULT_URL_FOR_TEST + cls.init_reasoning_token_verifier() + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--tp-size", + "4", + "--chunked-prefill-size", + "2048", + "--mamba-scheduler-strategy", + "extra_buffer", + "--mamba-track-interval", + "128", + "--mamba-ssm-dtype", + "bfloat16", + "--max-running-requests", + "128", + "--reasoning-parser", + "qwen3", + "--attention-backend", + "trtllm_mha", + "--quantization", + "modelopt_fp4", + "--speculative-algorithm", + "NEXTN", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--mem-fraction-static", + "0.8", + "--model-loader-extra-config", + '{"enable_multithread_load": true,"num_threads": 64}', + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + model=self.model, + eval_name="gsm8k", + num_shots=5, + num_examples=200, + max_tokens=16000, + num_threads=128, + repeat=1, + temperature=0.6, + top_p=0.95, + top_k=20, + base_url=self.base_url, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval(args) + print(f"{metrics=}") + self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"]) + + server_info = requests.get(self.base_url + "/server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print(f"{avg_spec_accept_length=}") + self.assertGreater(avg_spec_accept_length, 3.3) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/4-gpu-models/test_qwen3_next_models_mtp.py b/test/registered/4-gpu-models/test_qwen3_next_models_mtp.py index e7edfbee9e03..13ad4d0c3140 100644 --- a/test/registered/4-gpu-models/test_qwen3_next_models_mtp.py +++ b/test/registered/4-gpu-models/test_qwen3_next_models_mtp.py @@ -1,6 +1,5 @@ import unittest -from sglang.srt.environ import envs from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.kits.eval_accuracy_kit import GSM8KMixin from sglang.test.kits.kl_divergence_kit import KLDivergenceMixin @@ -94,16 +93,6 @@ class TestQwen3NextMTPV2(GSM8KMixin, KLDivergenceMixin, DefaultServerBase): "128", ] - @classmethod - def setUpClass(cls): - envs.SGLANG_ENABLE_SPEC_V2.set(True) - super().setUpClass() - - @classmethod - def tearDownClass(cls): - envs.SGLANG_ENABLE_SPEC_V2.set(False) - super().tearDownClass() - if __name__ == "__main__": unittest.main() diff --git a/test/registered/8-gpu-models/test_deepseek_v32.py b/test/registered/8-gpu-models/test_deepseek_v32.py index 5a6525d204f5..b792931979df 100644 --- a/test/registered/8-gpu-models/test_deepseek_v32.py +++ b/test/registered/8-gpu-models/test_deepseek_v32.py @@ -68,7 +68,6 @@ def test_deepseek_v32_all_variants(self): DEEPSEEK_V32_MODEL_PATH, tp_size=8, extra_args=BASE_ARGS + DP_ARGS + TOOL_CALL_ARGS + MTP_ARGS, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="DP8+MTP", ), # Variant: "tp" - Pure TP=8 only @@ -83,7 +82,6 @@ def test_deepseek_v32_all_variants(self): DEEPSEEK_V32_MODEL_PATH, tp_size=8, extra_args=BASE_ARGS + TP_ARGS + TOOL_CALL_ARGS + MTP_ARGS, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="TP8+MTP", ), ] diff --git a/test/registered/8-gpu-models/test_dsa_models_mtp.py b/test/registered/8-gpu-models/test_dsa_models_mtp.py index fe5ffe1c7a17..05cd47632e00 100644 --- a/test/registered/8-gpu-models/test_dsa_models_mtp.py +++ b/test/registered/8-gpu-models/test_dsa_models_mtp.py @@ -3,7 +3,6 @@ import requests -from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -48,13 +47,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): @@ -132,13 +130,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): @@ -219,13 +216,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): @@ -303,13 +299,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): diff --git a/test/registered/8-gpu-models/test_gpt_oss_120b.py b/test/registered/8-gpu-models/test_gpt_oss_120b.py index dae7bac4cbf5..114d93781886 100644 --- a/test/registered/8-gpu-models/test_gpt_oss_120b.py +++ b/test/registered/8-gpu-models/test_gpt_oss_120b.py @@ -48,7 +48,6 @@ def test_gpt_oss_120b_all_variants(self): "--speculative-num-draft-tokens=4", ] eagle3_env = { - "SGLANG_ENABLE_SPEC_V2": "1", "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN": "1", } diff --git a/test/registered/8-gpu-models/test_mistral_large3.py b/test/registered/8-gpu-models/test_mistral_large3.py index 68b7ca7d20af..58587d45e3e2 100644 --- a/test/registered/8-gpu-models/test_mistral_large3.py +++ b/test/registered/8-gpu-models/test_mistral_large3.py @@ -73,7 +73,6 @@ def test_mistral_large3_all_variants(self): MISTRAL_LARGE3_FP8_MODEL_PATH, tp_size=8, extra_args=base_args + eagle_args, - env={"SGLANG_ENABLE_SPEC_V2": "1"}, variant="TP8+MTP", ), # Variant: "nvfp4" - NVFP4 model + TP=8 + trtllm_mla backend diff --git a/test/registered/8-gpu-models/test_qwen35.py b/test/registered/8-gpu-models/test_qwen35.py index 813552b83421..bf7fb2d01e12 100644 --- a/test/registered/8-gpu-models/test_qwen35.py +++ b/test/registered/8-gpu-models/test_qwen35.py @@ -57,7 +57,6 @@ def test_qwen35(self): tp_size=8, extra_args=base_args + dp_args + mtp_args, variant="TP8+DP8+MTP", - env={"SGLANG_ENABLE_SPEC_V2": "1"}, ), ] diff --git a/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py b/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py index 04d4f6efb7a7..a58a998090af 100644 --- a/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py +++ b/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py @@ -88,7 +88,6 @@ def setUpClass(cls): cls.model = DEEPSEEK_R1_MODEL_PATH cls.base_url = DEFAULT_URL_FOR_TEST - envs.SGLANG_ENABLE_SPEC_V2.set(True) envs.SGLANG_ENABLE_OVERLAP_PLAN_STREAM.set(True) other_args = [ diff --git a/test/registered/ascend/basic_function/speculative_inference/test_npu_eagle3.py b/test/registered/ascend/basic_function/speculative_inference/test_npu_eagle3.py index efbb5f738e02..f6f1e37f517a 100644 --- a/test/registered/ascend/basic_function/speculative_inference/test_npu_eagle3.py +++ b/test/registered/ascend/basic_function/speculative_inference/test_npu_eagle3.py @@ -63,7 +63,6 @@ def setUpClass(cls): cls.extra_envs = { "SGLANG_ENABLE_OVERLAP_PLAN_STREAM": "1", - "SGLANG_ENABLE_SPEC_V2": "1", } os.environ.update(cls.extra_envs) diff --git a/test/registered/cp/test_deepseek_v32_cp_single_node.py b/test/registered/cp/test_deepseek_v32_cp_single_node.py index 5adefd3d1ff0..55fa2190717e 100644 --- a/test/registered/cp/test_deepseek_v32_cp_single_node.py +++ b/test/registered/cp/test_deepseek_v32_cp_single_node.py @@ -1,7 +1,6 @@ import unittest from types import SimpleNamespace -from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -52,13 +51,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): @@ -119,13 +117,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true, "num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) @classmethod def tearDownClass(cls): diff --git a/test/registered/ep/test_deepep_large.py b/test/registered/ep/test_deepep_large.py index 5967d489eb59..a400ae73d105 100644 --- a/test/registered/ep/test_deepep_large.py +++ b/test/registered/ep/test_deepep_large.py @@ -3,6 +3,7 @@ import requests +from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -86,48 +87,49 @@ class TestDeepseekMTP(CustomTestCase): def setUpClass(cls): cls.model = DEFAULT_DEEPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--tp", - "8", - "--enable-dp-attention", - "--dp", - "8", - "--moe-dense-tp-size", - "1", - "--enable-dp-lm-head", - "--moe-a2a-backend", - "deepep", - "--moe-runner-backend", - "deep_gemm", - "--enable-two-batch-overlap", - "--ep-num-redundant-experts", - "32", - "--ep-dispatch-algorithm", - "dynamic", - "--eplb-algorithm", - "deepseek", - "--cuda-graph-bs", - "64", # TODO: increase it to 128 when TBO is supported in draft_extend - "--max-running-requests", - "512", - "--speculative-algorithm", - "EAGLE", - "--speculative-num-steps", - "1", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "2", - "--disable-radix-cache", - "--model-loader-extra-config", - '{"enable_multithread_load": true,"num_threads": 64}', - ], - ) + with envs.SGLANG_ENABLE_SPEC_V2.override(False): + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "8", + "--enable-dp-attention", + "--dp", + "8", + "--moe-dense-tp-size", + "1", + "--enable-dp-lm-head", + "--moe-a2a-backend", + "deepep", + "--moe-runner-backend", + "deep_gemm", + "--enable-two-batch-overlap", + "--ep-num-redundant-experts", + "32", + "--ep-dispatch-algorithm", + "dynamic", + "--eplb-algorithm", + "deepseek", + "--cuda-graph-bs", + "64", # TODO: increase it to 128 when TBO is supported in draft_extend + "--max-running-requests", + "512", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "1", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "2", + "--disable-radix-cache", + "--model-loader-extra-config", + '{"enable_multithread_load": true,"num_threads": 64}', + ], + ) @classmethod def tearDownClass(cls): diff --git a/test/registered/mla/test_flashmla.py b/test/registered/mla/test_flashmla.py index abcad2a391a7..97fd2e2eaf8b 100644 --- a/test/registered/mla/test_flashmla.py +++ b/test/registered/mla/test_flashmla.py @@ -9,6 +9,7 @@ import requests import torch +from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -97,12 +98,13 @@ def setUpClass(cls): ] ) # Use longer timeout for DeepGEMM JIT compilation which can take 10-20 minutes - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2, - other_args=other_args, - ) + with envs.SGLANG_ENABLE_SPEC_V2.override(False): + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2, + other_args=other_args, + ) @classmethod def tearDownClass(cls): diff --git a/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py b/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py index 59dbe8741258..1cf38cdf7115 100644 --- a/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py +++ b/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py @@ -3,7 +3,6 @@ import requests -from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -54,13 +53,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true,"num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=SERVER_LAUNCH_TIMEOUT, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=SERVER_LAUNCH_TIMEOUT, + other_args=other_args, + ) @classmethod def tearDownClass(cls): @@ -145,13 +143,12 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true,"num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override(True): - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=SERVER_LAUNCH_TIMEOUT, - other_args=other_args, - ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=SERVER_LAUNCH_TIMEOUT, + other_args=other_args, + ) @classmethod def tearDownClass(cls): diff --git a/test/registered/spec/eagle/test_adaptive_speculative.py b/test/registered/spec/eagle/test_adaptive_speculative.py index 6863eacb4934..4f8a0ff4754c 100644 --- a/test/registered/spec/eagle/test_adaptive_speculative.py +++ b/test/registered/spec/eagle/test_adaptive_speculative.py @@ -6,6 +6,7 @@ import requests +from sglang.srt.environ import envs from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval @@ -58,32 +59,33 @@ def setUpClass(cls): cls.adaptive_config_path = f.name try: - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--attention-backend", - "triton", - "--speculative-algorithm", - "EAGLE", - "--speculative-draft-model-path", - cls.draft_model, - "--speculative-num-steps", - "1", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "2", - "--speculative-adaptive", - "--speculative-adaptive-config", - cls.adaptive_config_path, - "--skip-server-warmup", - "--mem-fraction-static", - "0.7", - ], - ) + with envs.SGLANG_ENABLE_SPEC_V2.override(False): + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--attention-backend", + "triton", + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + cls.draft_model, + "--speculative-num-steps", + "1", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "2", + "--speculative-adaptive", + "--speculative-adaptive-config", + cls.adaptive_config_path, + "--skip-server-warmup", + "--mem-fraction-static", + "0.7", + ], + ) except Exception: os.unlink(cls.adaptive_config_path) raise diff --git a/test/registered/spec/eagle/test_deepseek_v3_fp4_mtp_small.py b/test/registered/spec/eagle/test_deepseek_v3_fp4_mtp_small.py index a8372c84e180..46a7cf5e55b9 100644 --- a/test/registered/spec/eagle/test_deepseek_v3_fp4_mtp_small.py +++ b/test/registered/spec/eagle/test_deepseek_v3_fp4_mtp_small.py @@ -49,13 +49,9 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true,"num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override( + with envs.SGLANG_SPEC_NAN_DETECTION.override( True - ), envs.SGLANG_SPEC_NAN_DETECTION.override( - True - ), envs.SGLANG_SPEC_OOB_DETECTION.override( - True - ): + ), envs.SGLANG_SPEC_OOB_DETECTION.override(True): cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/registered/spec/eagle/test_eagle_infer_a.py b/test/registered/spec/eagle/test_eagle_infer_a.py index eca84327cfb8..077e4f46bfbd 100644 --- a/test/registered/spec/eagle/test_eagle_infer_a.py +++ b/test/registered/spec/eagle/test_eagle_infer_a.py @@ -2,6 +2,7 @@ import unittest import sglang as sgl +from sglang.srt.environ import envs from sglang.srt.utils.hf_transformers_utils import get_tokenizer from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import ( @@ -34,6 +35,14 @@ class TestEAGLEEngine(CustomTestCase): "accept_len": 3.6, } + @classmethod + def setUpClass(cls): + envs.SGLANG_ENABLE_SPEC_V2.set(False) + + @classmethod + def tearDownClass(cls): + envs.SGLANG_ENABLE_SPEC_V2.clear() + def setUp(self): self.prompt = "Today is a sunny day and I like" self.sampling_params = {"temperature": 0, "max_new_tokens": 8} diff --git a/test/registered/spec/eagle/test_eagle_infer_b.py b/test/registered/spec/eagle/test_eagle_infer_b.py index 7e941b38e693..3d4449271e9b 100644 --- a/test/registered/spec/eagle/test_eagle_infer_b.py +++ b/test/registered/spec/eagle/test_eagle_infer_b.py @@ -30,6 +30,11 @@ class TestEAGLEServerBasic(EagleServerBase): extra_args = ["--chunked-prefill-size", 128, "--max-running-requests", 8] + @classmethod + def setUpClass(cls): + with envs.SGLANG_ENABLE_SPEC_V2.override(False): + super().setUpClass() + # FIXME(lsyin): move the test methods to kits def test_request_abort(self): concurrency = 4 diff --git a/test/registered/spec/eagle/test_eagle_infer_beta.py b/test/registered/spec/eagle/test_eagle_infer_beta.py index 96a2096a706c..436ac2001e5e 100644 --- a/test/registered/spec/eagle/test_eagle_infer_beta.py +++ b/test/registered/spec/eagle/test_eagle_infer_beta.py @@ -63,9 +63,7 @@ def setUpClass(cls): *[str(i) for i in range(1, cls.max_running_requests + 1)], ] launch_args.extend(cls.other_launch_args) - with envs.SGLANG_ENABLE_SPEC_V2.override( - True - ), envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY.override( + with envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY.override( 1 ), envs.SGLANG_SPEC_NAN_DETECTION.override( True diff --git a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py index bb31b88aec70..0dcb7c5a2992 100644 --- a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py +++ b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py @@ -65,13 +65,9 @@ def setUpClass(cls): "--speculative-num-draft-tokens", "4", ] - with envs.SGLANG_ENABLE_SPEC_V2.override( + with envs.SGLANG_SPEC_NAN_DETECTION.override( True - ), envs.SGLANG_SPEC_NAN_DETECTION.override( - True - ), envs.SGLANG_SPEC_OOB_DETECTION.override( - True - ): + ), envs.SGLANG_SPEC_OOB_DETECTION.override(True): cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention_large.py b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention_large.py index c875e995c167..c64acb19cddd 100644 --- a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention_large.py +++ b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention_large.py @@ -73,13 +73,9 @@ def setUpClass(cls): "--model-loader-extra-config", '{"enable_multithread_load": true,"num_threads": 64}', ] - with envs.SGLANG_ENABLE_SPEC_V2.override( + with envs.SGLANG_SPEC_NAN_DETECTION.override( True - ), envs.SGLANG_SPEC_NAN_DETECTION.override( - True - ), envs.SGLANG_SPEC_OOB_DETECTION.override( - True - ): + ), envs.SGLANG_SPEC_OOB_DETECTION.override(True): cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/registered/spec/test_standalone_speculative_decoding.py b/test/registered/spec/test_standalone_speculative_decoding.py index 6510210dfde2..240d374f64a8 100644 --- a/test/registered/spec/test_standalone_speculative_decoding.py +++ b/test/registered/spec/test_standalone_speculative_decoding.py @@ -1,4 +1,3 @@ -import os import unittest from types import SimpleNamespace @@ -80,6 +79,7 @@ def setUpClass(cls): # please don't do this if you want to make your inference workload faster envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False) envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) + envs.SGLANG_ENABLE_SPEC_V2.set(False) model = cls.model cls.process = popen_launch_server( model, @@ -91,6 +91,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) + envs.SGLANG_ENABLE_SPEC_V2.clear() def test_gsm8k(self): requests.get(self.base_url + "/flush_cache") @@ -140,7 +141,6 @@ def setUpClass(cls): # please don't do this if you want to make your inference workload faster envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False) envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False) - envs.SGLANG_ENABLE_SPEC_V2.set(True) # Enable Speculative Decoding V2 model = cls.model cls.process = popen_launch_server( model, @@ -152,8 +152,6 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) - if "SGLANG_ENABLE_SPEC_V2" in os.environ: - envs.SGLANG_ENABLE_SPEC_V2.set(False) def test_gsm8k(self): requests.get(self.base_url + "/flush_cache")