sgl-project · Kangyan-Zhou · Apr 7, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 5, 2026
diff --git a/.codespellrc b/.codespellrc
@@ -1,3 +1,3 @@
 [codespell]
-ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS
+ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, ather
 skip = *.json,*.jsonl,*.patch,*.txt
diff --git a/benchmark/kernels/fused_moe_triton/common_utils.py b/benchmark/kernels/fused_moe_triton/common_utils.py
@@ -134,6 +134,10 @@ def get_model_config(
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         hidden_size = getattr(config, "moe_latent_size", None) or hidden_size
+    elif architecture == "Gemma4ForConditionalGeneration":
+        E = config.num_experts // ep_size
+        topk = config.top_k_experts
+        intermediate_size = config.moe_intermediate_size
     else:
         # Default: Mixtral
         E = config.num_local_experts // ep_size

diff --git a/benchmark/mmlu/bench_hf.py b/benchmark/mmlu/bench_hf.py
@@ -0,0 +1,151 @@
+"""
+Usage:
+python3 bench_hf.py --model-path meta-llama/Llama-2-7b-hf --data-dir data --ntrain 5
+"""
+
+import argparse
+import json
+import os
+import time
+
+import numpy as np
+import pandas as pd
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+choices = ["A", "B", "C", "D"]
+
+
+def format_subject(subject):
+    l = subject.split("_")
+    s = ""
+    for entry in l:
+        s += " " + entry
+    return s
+
+
+def format_example(df, idx, include_answer=True):
+    prompt = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+    for j in range(k):
+        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+    prompt += "\nAnswer:"
+    if include_answer:
+        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+    return prompt
+
+
+def gen_prompt(train_df, subject, k=-1):
+    prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
+        format_subject(subject)
+    )
+    if k == -1:
+        k = train_df.shape[0]
+    for i in range(k):
+        prompt += format_example(train_df, i)
+    return prompt
+
+
+@torch.no_grad()
+def main(args):
+    print(f"Loading model: {args.model_path}")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        device_map="auto",
+    ).eval()
+
+    subjects = sorted(
+        [
+            f.split("_test.csv")[0]
+            for f in os.listdir(os.path.join(args.data_dir, "test"))
+            if "_test.csv" in f
+        ]
+    )
+
+    all_cors = []
+    num_requests = 0
+    total_latency = 0
+
+    for subject in tqdm(subjects[: args.nsub]):
+        dev_df = pd.read_csv(
+            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
+        )[: args.ntrain]
+        test_df = pd.read_csv(
+            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
+        )
+
+        k = args.ntrain
+        few_shot_examples = gen_prompt(dev_df, subject, k)
+        while len(tokenizer.encode(few_shot_examples)) > 1536:
+            k -= 1
+            if k < 0:
+                break
+            few_shot_examples = gen_prompt(dev_df, subject, k)
+
+        preds = []
+        labels = []
+        tic = time.perf_counter()
+
+        for i in range(test_df.shape[0]):
+            prompt_end = format_example(test_df, i, include_answer=False)
+            prompt = few_shot_examples + prompt_end
+
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=1,
+                do_sample=False,
+                pad_token_id=tokenizer.eos_token_id,
+            )
+
+            output_str = tokenizer.decode(
+                output_ids[0][input_ids.shape[-1] :], skip_special_tokens=True
+            )
+            preds.append(output_str.strip()[0] if len(output_str.strip()) > 0 else "")
+            labels.append(test_df.iloc[i, test_df.shape[1] - 1])
+
+        latency = time.perf_counter() - tic
+        total_latency += latency
+
+        cors = [pred == label for pred, label in zip(preds, labels)]
+        all_cors.append(cors)
+        num_requests += len(test_df)
+
+        print(
+            f"Subject: {subject}, Accuracy: {np.mean(cors):.3f}, Latency: {latency:.3f}s"
+        )
+
+    weighted_acc = np.mean(np.concatenate(all_cors))
+    print(f"Total Latency: {total_latency:.3f}s")
+    print(f"Average Accuracy: {weighted_acc:.3f}")
+
+    if args.output:
+        with open(args.output, "a") as fout:
+            value = {
+                "task": "mmlu",
+                "backend": "hf",
+                "model": args.model_path,
+                "latency": round(total_latency, 3),
+                "accuracy": round(weighted_acc, 3),
+                "num_requests": num_requests,
+                "other": {
+                    "nsub": args.nsub,
+                    "ntrain": args.ntrain,
+                },
+            }
+            fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, required=True)
+    parser.add_argument("--ntrain", type=int, default=5)
+    parser.add_argument("--data-dir", type=str, default="data")
+    parser.add_argument("--nsub", type=int, default=60)
+    parser.add_argument("--output", type=str, help="Output file path")
+    args = parser.parse_args()
+    main(args)
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
@@ -404,6 +404,19 @@ def get_chat_template_by_model_path(model_path):
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="gemma-4-it",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|turn>user\n", "<turn|>\n"),
+            "assistant": ("<|turn>assistant\n", "<turn|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="dbrx-instruct",
@@ -611,8 +624,10 @@ def match_chat_yi(model_path: str):
 
 
 @register_chat_template_matching_function
-def match_gemma_it(model_path: str):
-    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+def match_gemma(model_path: str):
+    if re.search(r"gemma-4.*it", model_path, re.IGNORECASE):
+        return "gemma-4-it"
+    if re.search(r"(gemma.*it)|(gemma-3)", model_path, re.IGNORECASE):
         return "gemma-it"
 
 
@@ -636,12 +651,6 @@ def match_granite_instruct(model_path: str):
         return "granite-3-instruct"
 
 
-@register_chat_template_matching_function
-def match_gemma3_instruct(model_path: str):
-    if re.search(r"gemma-3", model_path, re.IGNORECASE):
-        return "gemma-it"
-
-
 @register_chat_template_matching_function
 def match_internvl_chat(model_path: str):
     if re.search(r"internvl2_5", model_path, re.IGNORECASE):

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -376,6 +376,8 @@ def _derive_hybrid_model(self):
         self.is_hybrid_swa_compress = self.hf_config.architectures[0] in [
             "MiMoV2FlashForCausalLM",
             "MiMoV2MTP",
+            "Gemma4ForCausalLM",
+            "Gemma4ForConditionalGeneration",
         ]
 
     def _derive_context_length(self, context_length: int):
@@ -433,7 +435,7 @@ def _derive_model_shapes(self):
         self.swa_v_head_dim = getattr(
             self.hf_text_config,
             "swa_v_head_dim",
-            self.v_head_dim,
+            self.swa_head_dim,
         )
         # FIXME: temporary special judge for MLA architecture
         if (
@@ -1301,6 +1303,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
     "Ernie4_5_VLMoeForConditionalGeneration",
     "Gemma3ForConditionalGeneration",
     "Gemma3nForConditionalGeneration",
+    "Gemma4ForConditionalGeneration",
     "Glm4vForConditionalGeneration",
     "Glm4vMoeForConditionalGeneration",
     "GlmOcrForConditionalGeneration",
@@ -1447,6 +1450,8 @@ def is_hybrid_swa_model(model_architectures: List[str]):
         "MiMoV2MTP",
         "Step3p5ForCausalLM",
         "Step3p5MTP",
+        "Gemma4ForCausalLM",
+        "Gemma4ForConditionalGeneration",
     }
     return any(arch in hybrid_swa_archs for arch in model_architectures)
 
@@ -1464,7 +1469,7 @@ def get_hybrid_layer_ids(
             i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
         ]
     elif "GptOssForCausalLM" in model_architectures:
-        layer_types = getattr(hf_text_config, "layer_types", None)
+        layer_types = getattr(hf_text_config, "layer_types", [])
         swa_attention_layer_ids = [
             i for i, x in enumerate(layer_types) if x == "sliding_attention"
         ]
@@ -1497,6 +1502,17 @@ def get_hybrid_layer_ids(
     elif "Step3p5MTP" in model_architectures:
         swa_attention_layer_ids = [0]
         full_attention_layer_ids = []
+    elif (
+        "Gemma4ForCausalLM" in model_architectures
+        or "Gemma4ForConditionalGeneration" in model_architectures
+    ):
+        layer_types = getattr(hf_text_config, "layer_types", [])
+        swa_attention_layer_ids = [
+            i for i, x in enumerate(layer_types) if x == "sliding_attention"
+        ]
+        full_attention_layer_ids = [
+            i for i, x in enumerate(layer_types) if x == "full_attention"
+        ]
     else:
         swa_attention_layer_ids = None
         full_attention_layer_ids = None

@@ -120,6 +120,11 @@ def __init__(
             and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
             and self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
         )
+        self.is_gemma4 = (
+            hasattr(self.tokenizer_manager.model_config, "hf_config")
+            and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
+            and self.tokenizer_manager.model_config.hf_config.model_type == "gemma4"
+        )
 
         self.use_dpsk_v32_encoding = self._use_dpsk_v32_encoding()
 
@@ -331,7 +336,7 @@ def _process_messages(
     ) -> MessageProcessingResult:
         """Process chat messages and apply chat template"""
         # GptOss model needs to keep special tokens for harmony parsing
-        if self.is_gpt_oss:
+        if self.is_gpt_oss or self.is_gemma4:
             request.skip_special_tokens = False
 
         self._patch_mistral_skip_special_tokens(request)
@@ -1280,12 +1285,18 @@ def _get_reasoning_from_request(self, request: ChatCompletionRequest) -> bool:
         """
         if not self.reasoning_parser:
             return False
-        if self.reasoning_parser in ["deepseek-v3"]:
+
+        if self.reasoning_parser == "deepseek-v3":
             # Models that require explicit enable thinking (thinking=True)
             return (
                 request.chat_template_kwargs is not None
                 and request.chat_template_kwargs.get("thinking") is True
             )
+        if self.reasoning_parser == "gemma4":
+            return (
+                request.chat_template_kwargs is not None
+                and request.chat_template_kwargs.get("enable_thinking") is True
+            )
         if self.reasoning_parser in ["kimi_k2"]:
             # Models that thinking by default, and can be disabled by setting thinking=False
             return (

@@ -14,6 +14,7 @@
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
 from sglang.srt.function_call.deepseekv31_detector import DeepSeekV31Detector
 from sglang.srt.function_call.deepseekv32_detector import DeepSeekV32Detector
+from sglang.srt.function_call.gemma4_detector import Gemma4Detector
 from sglang.srt.function_call.gigachat3_detector import GigaChat3Detector
 from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
 from sglang.srt.function_call.glm47_moe_detector import Glm47MoeDetector
@@ -69,6 +70,7 @@ class FunctionCallParser:
         "interns1": InternlmDetector,
         "hermes": HermesDetector,
         "gigachat3": GigaChat3Detector,
+        "gemma4": Gemma4Detector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):