sgl-project · Fridge003 · Apr 1, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/python/sglang/srt/models/qwen3_vl_moe.py b/python/sglang/srt/models/qwen3_vl_moe.py
@@ -179,9 +179,8 @@ def __init__(
     ):
         super().__init__(config, quant_config, prefix, language_model_cls)
 
-    # Only allow LoRA on attention projections within text layers for MoE.
     _lora_pattern_moe = re.compile(
-        r"^model\.layers\.(\d+)\.self_attn\.(?:qkv_proj|o_proj)$"
+        r"^(?:model\.layers\.(\d+)\.(?:self_attn\.(?:qkv_proj|o_proj)|mlp\.experts)|lm_head|model\.embed_tokens)$"
     )
 
     def should_apply_lora(self, module_name: str) -> bool:

diff --git a/test/manual/lora/test_lora_qwen3_vl.py b/test/manual/lora/test_lora_qwen3_vl.py
@@ -47,11 +47,15 @@ def test_qwen3_vl_moe_should_apply_lora_regex(self):
         positives = (
             "model.layers.0.self_attn.qkv_proj",
             "model.layers.5.self_attn.o_proj",
+            "model.layers.0.mlp.experts",
+            "lm_head",
+            "model.embed_tokens",
         )
         negatives = (
             "model.layers.0.mlp.gate_up_proj",
             "model.layers.0.mlp.down_proj",
             "visual.blocks.0.attn.qkv_proj",
+            "visual.blocks.0.mlp.experts",
             "model.layers.x.self_attn.qkv_proj",
             "model.layers.0.attn.qkv_proj",
         )

diff --git a/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py b/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py
@@ -0,0 +1,151 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+Regression test for Qwen3-VL-30B-A3B-Instruct LoRA logprob accuracy.
+
+Compares SGLang LoRA logprobs against reference training logprobs from a
+pre-computed dataset. The LoRA adapter and reference data are downloaded from:
+https://huggingface.co/datasets/yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct
+
+Usage:
+    python -m unittest test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff
+"""
+
+import multiprocessing as mp
+import os
+import unittest
+
+import torch
+from huggingface_hub import snapshot_download
+
+import sglang as sgl
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_cuda_ci(
+    est_time=300,
+    suite="stage-c-test-8-gpu-h200",
+)
+
+BASE_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct"
+LORA_BACKEND = "triton"
+MAX_LORA_RANK = 32
+TP_SIZE = 8
+DISABLE_CUDA_GRAPH = True
+MOE_RUNNER_BACKEND = "triton"
+EXPERTS_SHARED_OUTER_LORAS = True
+PREFILL_ATTENTION_BACKEND = "fa4"
+DECODE_ATTENTION_BACKEND = "fa4"
+
+KL_THRESHOLD = 1e-2
+
+
+def kl_v2(a, b):
+    a = torch.tensor(a) if not torch.is_tensor(a) else a
+    b = torch.tensor(b) if not torch.is_tensor(b) else b
+    return (((a - b) ** 2) * 0.5).mean().item()
+
+
+def get_prompt_logprobs(engine, input_ids, lora_path):
+    out = engine.generate(
+        input_ids=input_ids,
+        sampling_params={"max_new_tokens": 0, "temperature": 0.0},
+        return_logprob=True,
+        logprob_start_len=0,
+        lora_path=lora_path,
+    )
+    return [logprob for logprob, _, _ in out["meta_info"]["input_token_logprobs"]][1:]
+
+
+class TestLoRAQwen3VL_30B_A3B_Instruct_LogprobDiff(CustomTestCase):
+
+    def test_lora_qwen3_vl_30b_a3b_instruct_logprob_accuracy(self):
+        adapter_path = snapshot_download(
+            LORA_HF_REPO,
+            repo_type="dataset",
+        )
+
+        engine = sgl.Engine(
+            model_path=BASE_MODEL,
+            tp_size=TP_SIZE,
+            enable_lora=True,
+            max_lora_rank=MAX_LORA_RANK,
+            lora_paths={"my_lora": adapter_path},
+            lora_backend=LORA_BACKEND,
+            attention_backend="flashinfer",
+            disable_cuda_graph=DISABLE_CUDA_GRAPH,
+            moe_runner_backend=MOE_RUNNER_BACKEND,
+            experts_shared_outer_loras=EXPERTS_SHARED_OUTER_LORAS,
+            prefill_attention_backend=PREFILL_ATTENTION_BACKEND,
+            decode_attention_backend=DECODE_ATTENTION_BACKEND,
+        )
+
+        try:
+            cdata = torch.load(
+                os.path.join(adapter_path, "compare_sample_train_data.pt"),
+                weights_only=False,
+            )
+
+            base_logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path=None)
+            logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path="my_lora")
+
+            base_t = torch.tensor(base_logprobs)
+            lora_t = torch.tensor(logprobs)
+            diff = (base_t - lora_t).abs()
+            print(
+                f"[VERIFY] base vs lora: mean_diff={diff.mean().item():.6f}, "
+                f"max_diff={diff.max().item():.6f}, "
+                f"identical={torch.equal(base_t, lora_t)}"
+            )
+
+            self.assertFalse(
+                torch.equal(base_t, lora_t),
+                "LoRA logprobs should differ from base model logprobs",
+            )
+
+            kl_sglang_trainer = kl_v2(cdata["training_logprobs"], logprobs)
+            kl_orig_trainer = kl_v2(
+                cdata["training_logprobs"], cdata["sampling_logprobs"]
+            )
+            kl_sglang_orig = kl_v2(logprobs, cdata["sampling_logprobs"])
+
+            print(f"KL(orig_sampler, trainer) = {kl_orig_trainer:.6e}")
+            print(f"KL(sglang, trainer)       = {kl_sglang_trainer:.6e}")
+            print(f"KL(sglang, orig_sampler)  = {kl_sglang_orig:.6e}")
+
+            self.assertLessEqual(
+                kl_sglang_trainer,
+                KL_THRESHOLD,
+                f"KL(sglang, trainer) = {kl_sglang_trainer:.6e} exceeds "
+                f"threshold {KL_THRESHOLD}",
+            )
+
+        finally:
+            engine.shutdown()
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    try:
+        unittest.main(warnings="ignore", verbosity=2)
+    finally:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()