diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 67594a7f728..03e47543a5d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -87,53 +87,11 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh

-      - name: Test data parallelism (DP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_data_parallelism.py
-
-      - name: Test data parallelism attention (DP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_dp_attention.py
-
-      - name: Test update weights from distributed
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_update_weights_from_distributed.py
-
-      - name: Test VerlEngine
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_verl_engine.py
-
-      - name: Test Patch Torch
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_patch_torch.py
-
-      - name: Test expert parallelism (EP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 test_moe_ep.py
-
-      - name: Test torch compile (TP=2)
+      - name: Run test
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 test_mla_tp.py
-
-      - name: Test lora tensor parallelism (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt/models/lora
-          python3 test_lora_tp.py
+          python3 run_suite.py --suite per-commit-2-gpu

   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
diff --git a/python/sglang/srt/constrained/base_grammar_backend.py b/python/sglang/srt/constrained/base_grammar_backend.py
index 5d1b0da2904..181f0d21318 100644
--- a/python/sglang/srt/constrained/base_grammar_backend.py
+++ b/python/sglang/srt/constrained/base_grammar_backend.py
@@ -169,7 +169,9 @@ def reset(self):
         self.cache.clear()


-def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend

@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index dfdfef662c1..012788708d8 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -599,6 +599,7 @@ def reset_for_retract(self):
         self.extend_logprob_start_len = 0
         self.is_chunked = 0
         self.req_pool_idx = None
+        self.already_computed = 0

     def __repr__(self):
         return (
@@ -960,8 +961,6 @@ def prepare_for_extend(self):
                     # If req.input_embeds is already a list, append its content directly
                     input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting

-            if req.is_retracted:
-                req.already_computed = 0
             req.cached_tokens += pre_len - req.already_computed
             req.already_computed = seq_len
             req.is_retracted = False
@@ -1189,7 +1188,11 @@ def get_required_tokens(num_reqs: int):
                     self.req_to_token_pool.free(req.req_pool_idx)
                 else:
                     # TODO: apply more fine-grained retraction
-                    last_uncached_pos = len(req.prefix_indices)
+                    last_uncached_pos = (
+                        (len(req.prefix_indices) + server_args.page_size - 1)
+                        // server_args.page_size
+                        * server_args.page_size
+                    )
                     token_indices = self.req_to_token_pool.req_to_token[
                         req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
                     ]
diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py
index 45fe2fce628..3694490f211 100644
--- a/python/sglang/srt/metrics/collector.py
+++ b/python/sglang/srt/metrics/collector.py
@@ -33,7 +33,7 @@ class SchedulerMetricsCollector:
     def __init__(self, labels: Dict[str, str]) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import Gauge
+        from prometheus_client import Gauge, Histogram

         self.labels = labels
         self.last_log_time = time.time()

@@ -139,10 +139,10 @@ def __init__(self, labels: Dict[str, str]) -> None:
             labelnames=labels.keys(),
             buckets=[
                 0.1,
-                0.3,
-                0.5,
-                0.7,
-                0.9,
+                0.2,
+                0.4,
+                0.6,
+                0.8,
                 1,
                 2,
                 4,
@@ -153,36 +153,9 @@ def __init__(self, labels: Dict[str, str]) -> None:
                 40,
                 60,
                 80,
-                120,
-                160,
-            ],
-        )
-
-        self.histogram_time_per_output_token = Histogram(
-            name="sglang:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
-            labelnames=labels.keys(),
-            buckets=[
-                0.002,
-                0.005,
-                0.010,
-                0.020,
-                0.030,
-                0.040,
-                0.050,
-                0.060,
-                0.070,
-                0.080,
-                0.090,
-                0.100,
-                0.150,
-                0.200,
-                0.300,
-                0.400,
-                0.600,
-                0.800,
-                1.000,
-                2.000,
+                100,
+                200,
+                400,
             ],
         )

@@ -202,17 +175,18 @@ def __init__(self, labels: Dict[str, str]) -> None:
                 0.030,
                 0.035,
                 0.040,
-                0.050,
-                0.075,
+                0.060,
+                0.080,
                 0.100,
-                0.150,
                 0.200,
-                0.300,
                 0.400,
-                0.500,
-                0.750,
+                0.600,
+                0.800,
                 1.000,
                 2.000,
+                4.000,
+                6.000,
+                8.000,
             ],
         )

@@ -224,23 +198,22 @@ def __init__(self, labels: Dict[str, str]) -> None:
                 0.1,
                 0.2,
                 0.4,
+                0.6,
                 0.8,
                 1,
                 2,
-                5,
+                4,
+                6,
+                8,
                 10,
                 20,
                 40,
                 60,
                 80,
                 100,
-                150,
                 200,
-                250,
-                300,
-                350,
-                500,
-                1000,
+                400,
+                800,
             ],
         )

@@ -256,13 +229,10 @@ def observe_one_finished_request(
     ):
         self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
         self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
-        self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
+        if cached_tokens > 0:
+            self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
         self.num_requests_total.labels(**self.labels).inc(1)
         self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
-        if generation_tokens >= 1:
-            self.histogram_time_per_output_token.labels(**self.labels).observe(
-                e2e_latency / generation_tokens
-            )

     def observe_time_to_first_token(self, value: float):
         self.histogram_time_to_first_token.labels(**self.labels).observe(value)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 67a155f8fdb..6f47254877a 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -128,7 +128,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
-    grammar_backend: Optional[str] = "xgrammar"
+    grammar_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -193,6 +193,13 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998

     def __post_init__(self):
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -274,12 +281,9 @@ def __post_init__(self):
             )
             self.disable_cuda_graph = True

-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
+        # Choose grammar backend
+        if self.grammar_backend is None:
+            self.grammar_backend = "xgrammar"

         # Data parallelism attention
         if self.enable_dp_attention:
@@ -813,7 +817,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance"],
+            choices=["xgrammar", "outlines", "llguidance", "none"],
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 4c432ae7015..9416166819e 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -1012,9 +1012,6 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):


 class CustomTestCase(unittest.TestCase):
-    pass
-
-    """
     def _callTestMethod(self, method):
         max_retry = int(
             os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
@@ -1023,4 +1020,3 @@ def _callTestMethod(self, method):
             lambda: super(CustomTestCase, self)._callTestMethod(method),
             max_retry=max_retry,
         )
-    """
diff --git a/test/srt/models/lora/test_lora_tp.py b/test/srt/models/lora/test_lora_tp.py
index 925a34bc318..5e34538b52a 100644
--- a/test/srt/models/lora/test_lora_tp.py
+++ b/test/srt/models/lora/test_lora_tp.py
@@ -33,6 +33,9 @@
         ],
         max_loras_per_batch=1,
     ),
+]
+
+ALL_OTHER_LORA_MODELS = [
     LoRAModelCase(
         base="meta-llama/Llama-3.1-8B-Instruct",
         adaptors=[
@@ -43,9 +46,6 @@
         ],
         max_loras_per_batch=1,
     ),
-]
-
-ALL_OTHER_LORA_MODELS = [
     LoRAModelCase(
         base="meta-llama/Llama-2-7b-hf",
         adaptors=[LoRAAdaptor(name="winddude/wizardLM-LlaMA-LoRA-7B")],
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 628e3946d7a..35385ab3272 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -16,7 +16,7 @@ class TestFile:
         TestFile("models/lora/test_lora.py", 76),
         TestFile("models/lora/test_lora_backend.py", 420),
         TestFile("models/lora/test_multi_lora_backend.py", 144),
-        TestFile("models/test_embedding_models.py", 119),
+        TestFile("models/test_embedding_models.py", 35),
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_grok_models.py", 60),
         TestFile("models/test_qwen_models.py", 82),
@@ -38,7 +38,7 @@ class TestFile:
         TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 92),
         TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_mla_int8_deepseek_v3.py", 421),
+        TestFile("test_mla_int8_deepseek_v3.py", 522),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 93),
         TestFile("test_no_chunked_prefill.py", 126),
@@ -59,7 +59,7 @@ class TestFile:
         TestFile("test_srt_endpoint.py", 94),
         TestFile("test_torch_compile.py", 76),
         TestFile("test_torch_compile_moe.py", 85),
-        TestFile("test_torch_native_attention_backend.py", 149),
+        TestFile("test_torch_native_attention_backend.py", 123),
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
@@ -76,6 +76,16 @@ class TestFile:
         TestFile("test_hicache.py", 60),
         TestFile("test_hicache_mla.py", 90),
     ],
+    "per-commit-2-gpu": [
+        TestFile("test_data_parallelism.py", 90),
+        TestFile("test_dp_attention.py", 90),
+        TestFile("test_update_weights_from_distributed.py", 100),
+        TestFile("test_verl_engine.py", 100),
+        TestFile("test_patch_torch.py", 30),
+        TestFile("test_moe_ep.py", 220),
+        TestFile("test_mla_tp.py", 420),
+        TestFile("test_lora_tp.py", 300),
+    ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
     ],
diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py
index 622f9b633c7..5fb5e223b72 100644
--- a/test/srt/test_dp_attention.py
+++ b/test/srt/test_dp_attention.py
@@ -60,3 +60,7 @@ def test_mgsm_en(self):
         metrics = run_eval(args)
         print(f"{metrics=}")
         self.assertGreater(metrics["score"], 0.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py
index fadd427b707..1ab48702512 100644
--- a/test/srt/test_metrics.py
+++ b/test/srt/test_metrics.py
@@ -63,7 +63,6 @@ def test_metrics_enabled(self):
             "sglang:cached_tokens_total",
             "sglang:num_requests_total",
             "sglang:time_to_first_token_seconds",
-            "sglang:time_per_output_token_seconds",
             "sglang:inter_token_latency_seconds",
             "sglang:e2e_request_latency_seconds",
         ]
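
How to test (a rough local sketch; assumes a 2-GPU machine with the dependencies installed by scripts/ci_install_dependency.sh):

    # Run the consolidated 2-GPU suite the same way the updated pr-test.yml step does
    cd test/srt
    python3 run_suite.py --suite per-commit-2-gpu

To smoke-test the new "none" grammar backend, launch the server with grammar-guided decoding disabled; the model path here is a placeholder:

    python3 -m sglang.launch_server --model-path <model> --grammar-backend none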