Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions python/sglang/srt/models/qwen3_vl_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,8 @@ def __init__(
):
super().__init__(config, quant_config, prefix, language_model_cls)

# Allow LoRA on text-layer attention projections (qkv_proj / o_proj) and MoE experts, plus lm_head and model.embed_tokens.
_lora_pattern_moe = re.compile(
r"^model\.layers\.(\d+)\.self_attn\.(?:qkv_proj|o_proj)$"
r"^(?:model\.layers\.(\d+)\.(?:self_attn\.(?:qkv_proj|o_proj)|mlp\.experts)|lm_head|model\.embed_tokens)$"
)

def should_apply_lora(self, module_name: str) -> bool:
Expand Down
4 changes: 4 additions & 0 deletions test/manual/lora/test_lora_qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,15 @@ def test_qwen3_vl_moe_should_apply_lora_regex(self):
positives = (
"model.layers.0.self_attn.qkv_proj",
"model.layers.5.self_attn.o_proj",
"model.layers.0.mlp.experts",
"lm_head",
"model.embed_tokens",
)
negatives = (
"model.layers.0.mlp.gate_up_proj",
"model.layers.0.mlp.down_proj",
"visual.blocks.0.attn.qkv_proj",
"visual.blocks.0.mlp.experts",
"model.layers.x.self_attn.qkv_proj",
"model.layers.0.attn.qkv_proj",
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Copyright 2023-2025 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
Regression test for Qwen3-VL-30B-A3B-Instruct LoRA logprob accuracy.

Compares SGLang LoRA logprobs against reference training logprobs from a
pre-computed dataset. The LoRA adapter and reference data are downloaded from:
https://huggingface.co/datasets/yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct

Usage:
python -m unittest test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff
"""

import multiprocessing as mp
import os
import unittest

import torch
from huggingface_hub import snapshot_download

import sglang as sgl
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import CustomTestCase

# Register this test file with the CUDA CI registry, declaring an estimated
# runtime (presumably in seconds -- confirm against ci_register) and the suite
# it should run in.
register_cuda_ci(
    est_time=300,
    suite="stage-c-test-8-gpu-h200",
)

# Base model served by the engine, and the Hugging Face dataset repo that
# provides both the LoRA adapter and the reference logprob sample.
BASE_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct"
# LoRA / parallelism settings forwarded to sgl.Engine by the test in this file
# (lora_backend, max_lora_rank and tp_size respectively).
LORA_BACKEND = "triton"
MAX_LORA_RANK = 32
TP_SIZE = 8
Comment thread
Fridge003 marked this conversation as resolved.
Outdated
# Remaining engine options forwarded verbatim to sgl.Engine by the test in
# this file (disable_cuda_graph, moe_runner_backend,
# experts_shared_outer_loras, prefill_attention_backend,
# decode_attention_backend).
DISABLE_CUDA_GRAPH = True
MOE_RUNNER_BACKEND = "triton"
EXPERTS_SHARED_OUTER_LORAS = True
PREFILL_ATTENTION_BACKEND = "fa4"
DECODE_ATTENTION_BACKEND = "fa4"

# Upper bound asserted on kl_v2(training_logprobs, sglang_lora_logprobs).
KL_THRESHOLD = 1e-2


def kl_v2(a, b):
    """Return half the mean squared difference between ``a`` and ``b``.

    Each argument may be a ``torch.Tensor`` or anything ``torch.tensor``
    accepts (e.g. a list of floats); non-tensor inputs are converted first.
    The callers in this file feed log-probabilities and report the result as
    a "KL" value.

    Returns:
        A Python float: ``mean(0.5 * (a - b) ** 2)``.
    """
    lhs = a if torch.is_tensor(a) else torch.tensor(a)
    rhs = b if torch.is_tensor(b) else torch.tensor(b)
    half_squared_gap = 0.5 * (lhs - rhs) ** 2
    return half_squared_gap.mean().item()


def get_prompt_logprobs(engine, input_ids, lora_path):
    """Score a prompt with ``engine`` and return its per-token logprobs.

    Issues a zero-new-token, temperature-0 ``generate`` request with
    ``return_logprob=True`` and ``logprob_start_len=0`` so that logprobs are
    requested from the start of the prompt.

    Args:
        engine: Object exposing ``generate(**kwargs)`` whose result contains
            ``["meta_info"]["input_token_logprobs"]`` as a sequence of
            3-element entries with the logprob first.
        input_ids: Prompt token ids, passed through unchanged.
        lora_path: Adapter name to apply, or ``None`` for the base model.

    Returns:
        List of logprob values for every prompt position except the first.
    """
    request = dict(
        input_ids=input_ids,
        sampling_params={"max_new_tokens": 0, "temperature": 0.0},
        return_logprob=True,
        logprob_start_len=0,
        lora_path=lora_path,
    )
    response = engine.generate(**request)

    values = []
    for value, _, _ in response["meta_info"]["input_token_logprobs"]:
        values.append(value)

    # The first position is dropped -- presumably because the first token has
    # no preceding context to be scored against (confirm with engine docs).
    return values[1:]


class TestLoRAQwen3VL_30B_A3B_Instruct_LogprobDiff(CustomTestCase):
    """Regression check of SGLang LoRA prompt logprobs against reference data."""

    def test_lora_qwen3_vl_30b_a3b_instruct_logprob_accuracy(self):
        """Compare SGLang LoRA logprobs with the pre-computed trainer logprobs.

        Steps:
          1. Download the dataset repo holding the adapter and reference sample.
          2. Launch an engine with the adapter registered as ``"my_lora"``.
          3. Score the reference tokens with and without the adapter.
          4. Require that the adapter changes the logprobs at all, and that
             ``kl_v2(training_logprobs, sglang_logprobs)`` stays within
             ``KL_THRESHOLD``.

        The engine is always shut down, even when an assertion fails.
        """
        snapshot_dir = snapshot_download(LORA_HF_REPO, repo_type="dataset")

        # NOTE(review): attention_backend is the only option here that is not
        # driven by a module-level constant, and it is supplied together with
        # explicit prefill/decode backends -- confirm which one takes effect.
        engine_options = {
            "model_path": BASE_MODEL,
            "tp_size": TP_SIZE,
            "enable_lora": True,
            "max_lora_rank": MAX_LORA_RANK,
            "lora_paths": {"my_lora": snapshot_dir},
            "lora_backend": LORA_BACKEND,
            "attention_backend": "flashinfer",
            "disable_cuda_graph": DISABLE_CUDA_GRAPH,
            "moe_runner_backend": MOE_RUNNER_BACKEND,
            "experts_shared_outer_loras": EXPERTS_SHARED_OUTER_LORAS,
            "prefill_attention_backend": PREFILL_ATTENTION_BACKEND,
            "decode_attention_backend": DECODE_ATTENTION_BACKEND,
        }
        llm = sgl.Engine(**engine_options)

        try:
            sample_file = os.path.join(snapshot_dir, "compare_sample_train_data.pt")
            # NOTE(review): weights_only=False unpickles arbitrary objects from
            # a downloaded file; this is only acceptable while the dataset
            # repo is trusted.
            reference = torch.load(sample_file, weights_only=False)

            # Same tokens scored twice: once on the base model, once with LoRA.
            without_adapter = get_prompt_logprobs(
                llm, reference["tokens"], lora_path=None
            )
            with_adapter = get_prompt_logprobs(
                llm, reference["tokens"], lora_path="my_lora"
            )

            base_t = torch.tensor(without_adapter)
            lora_t = torch.tensor(with_adapter)
            diff = (base_t - lora_t).abs()
            print(
                f"[VERIFY] base vs lora: mean_diff={diff.mean().item():.6f}, "
                f"max_diff={diff.max().item():.6f}, "
                f"identical={torch.equal(base_t, lora_t)}"
            )

            # Sanity check: an adapter that changes nothing was not applied.
            self.assertFalse(
                torch.equal(base_t, lora_t),
                msg="LoRA logprobs should differ from base model logprobs",
            )

            kl_sglang_trainer = kl_v2(reference["training_logprobs"], with_adapter)
            kl_orig_trainer = kl_v2(
                reference["training_logprobs"], reference["sampling_logprobs"]
            )
            kl_sglang_orig = kl_v2(with_adapter, reference["sampling_logprobs"])

            print(f"KL(orig_sampler, trainer) = {kl_orig_trainer:.6e}")
            print(f"KL(sglang, trainer) = {kl_sglang_trainer:.6e}")
            print(f"KL(sglang, orig_sampler) = {kl_sglang_orig:.6e}")

            # Only the SGLang-vs-trainer value is gated; the other two are
            # printed for diagnosis.
            self.assertLessEqual(
                kl_sglang_trainer,
                KL_THRESHOLD,
                msg=(
                    f"KL(sglang, trainer) = {kl_sglang_trainer:.6e} exceeds "
                    f"threshold {KL_THRESHOLD}"
                ),
            )

        finally:
            llm.shutdown()


# Script entry point: run this module's tests directly.
if __name__ == "__main__":
    # Request the "spawn" start method for child processes.
    # set_start_method raises RuntimeError when a start method has already
    # been fixed, in which case the existing one is kept.
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        pass

    # unittest.main exits the interpreter when done; the finally clause still
    # runs, so GPU cleanup happens regardless of the test outcome.
    try:
        unittest.main(warnings="ignore", verbosity=2)
    finally:
        if torch.cuda.is_available():
            # Release cached allocator blocks, then wait for outstanding
            # CUDA work to finish.
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
Loading