From 9f602f9254d2d46dd46c1b7a1779be9e92593617 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 00:17:20 +0000 Subject: [PATCH 1/7] Support auto-detection of LoRA target modules from adapter config When adapter_config.json uses PEFT shorthands like "all-linear" or "all", SGLang previously required users to explicitly specify --lora-target-modules on the CLI. This change adds a model-scanning approach that inspects the loaded base model to discover all LoRA-compatible linear modules automatically. Changes: - utils.py: add auto_detect_lora_target_modules() that walks the model graph, collects LinearBase/FusedMoE/ParallelLMHead module suffixes, normalizes them, and filters to the set supported by get_hidden_dim and init_buffers. - lora_manager.py: in init_lora_shapes(), resolve "all-linear"/"all" via model scanning instead of raising ValueError when CLI target modules are not provided. In init_lora_modules(), guard against modules outside decoder layers (layer_id is None) to prevent TypeError on non-layer modules. Made-with: Cursor --- python/sglang/srt/lora/lora_manager.py | 26 ++++++++------ python/sglang/srt/lora/utils.py | 48 +++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 73f6bc23544e..5120c97a72b2 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -37,6 +37,7 @@ from sglang.srt.lora.utils import ( LoRAType, get_normalized_target_modules, + auto_detect_lora_target_modules, get_target_module_name, ) from sglang.srt.managers.io_struct import LoRAUpdateOutput @@ -424,9 +425,6 @@ def init_lora_shapes( for lora_id, config in self.configs.items(): # Handle PEFT shorthand strings like "all-linear" or "all". 
- # These cannot be resolved to concrete module names without - # inspecting the base model, so we require the user to specify - # --lora-target-modules explicitly when such shorthands are used. if isinstance(config.target_modules, str): if config.target_modules in ("all-linear", "all"): if target_modules is not None: @@ -434,14 +432,20 @@ def init_lora_shapes( # per-adapter inference for this adapter. continue else: - lora_name = self.lora_refs[lora_id].lora_name - raise ValueError( - f"LoRA adapter '{lora_name}' uses " - f"target_modules='{config.target_modules}' which cannot " - "be resolved automatically. Please explicitly specify " - "--lora-target-modules during server startup. You can " - "specify 'all' to enable all supported module types." + # Resolve by scanning the base model for all + # LoRA-compatible linear modules. + adapter_target_modules = auto_detect_lora_target_modules( + self.base_model ) + logger.info( + "LoRA adapter '%s' uses target_modules='%s'. " + "Resolved to %s by inspecting the base model.", + self.lora_refs[lora_id].lora_name, + config.target_modules, + sorted(adapter_target_modules), + ) + self.target_modules.update(adapter_target_modules) + continue else: raise ValueError( f"SGLang does not recognize target_modules=" @@ -672,6 +676,8 @@ def init_lora_modules(self): # The module should be converted if it is included in target_names if module_name.split(".")[-1] in self.target_modules: layer_id = get_layer_id(module_name) + if layer_id is None: + continue self.lora_modules[layer_id][module_name] = self.set_lora_module( module_name, module ) diff --git a/python/sglang/srt/lora/utils.py b/python/sglang/srt/lora/utils.py index 45987d736d3c..490600769d48 100644 --- a/python/sglang/srt/lora/utils.py +++ b/python/sglang/srt/lora/utils.py @@ -113,12 +113,12 @@ def get_normalized_target_modules( Handles both base module names (e.g., "gate_proj") and prefixed module names (e.g., "feed_forward.gate_proj"). 
Also handles PEFT shorthand strings like "all-linear" or "all" by returning - {"all"} as a sentinel value (the caller should check for "all" and fall - back to the CLI --lora-target-modules to determine the concrete module set). + {"all"} as a sentinel value. Callers that need a concrete module set + should use :func:`auto_detect_lora_target_modules` to resolve the shorthand + against the loaded base model. """ - # Handle PEFT shorthand strings — these cannot be resolved to concrete - # module names without inspecting the base model, so we return {"all"} - # and let the caller fall back to the CLI --lora-target-modules. + # Handle PEFT shorthand strings — return {"all"} as sentinel. + # Callers can resolve to concrete names via auto_detect_lora_target_modules(). if isinstance(target_modules, str): return {"all"} @@ -175,6 +175,44 @@ def get_target_module_name(full_module_name: str, target_modules: Set[str]) -> s EMBEDDING_NAMES = ["embed_tokens", "lm_head"] ROW_PARALLELISM_LINEAR_LORA_NAMES = ["o_proj", "down_proj", "down_proj_moe"] +# Normalized module names that the LoRA system fully supports +# (i.e. get_hidden_dim, init_buffers, and init_lora_modules can handle them). +_KNOWN_LORA_TARGET_MODULES = frozenset( + { + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "lm_head", + } +) + + +def auto_detect_lora_target_modules(model: "torch.nn.Module") -> set: + """Discover LoRA-compatible modules by inspecting the base model. + + Walks the model graph and returns the set of *normalized* target-module + names that (a) actually exist in the model and (b) the LoRA memory pool + can handle. This is used to resolve PEFT shorthands like ``"all-linear"`` + without requiring the user to enumerate modules on the CLI. 
+ """ + from sglang.srt.layers.linear import LinearBase + from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE + from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead + + raw_names: set = set() + for name, module in model.named_modules(): + if isinstance(module, FusedMoE): + raw_names.add("gate_up_proj") + raw_names.add("down_proj") + elif isinstance(module, ParallelLMHead): + raw_names.add("lm_head") + elif isinstance(module, LinearBase): + raw_names.add(name.split(".")[-1]) + + normalized = get_normalized_target_modules(raw_names) + return normalized & _KNOWN_LORA_TARGET_MODULES + def get_lm_head_lora_b_shard_size(output_dim: int, shard_indices=None) -> int: """Get the LoRA B output dimension for lm_head, accounting for TP. From 4f30d8c299482470de516802966d6fd77d56f50c Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 00:45:48 +0000 Subject: [PATCH 2/7] add ci --- .../lora/test_lora_qwen3_8b_logprob_diff.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 test/registered/lora/test_lora_qwen3_8b_logprob_diff.py diff --git a/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py b/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py new file mode 100644 index 000000000000..74b5fe9cdb8d --- /dev/null +++ b/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py @@ -0,0 +1,151 @@ +# Copyright 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +""" +Regression test for Qwen3-8B LoRA logprob accuracy. + +Compares SGLang LoRA logprobs against reference training logprobs from a +pre-computed dataset. The LoRA adapter and reference data are downloaded from: +https://huggingface.co/datasets/yushengsu/lora-diff-Qwen3-8B + +Usage: + python -m unittest test_lora_qwen3_8b_logprob_diff +""" + +import multiprocessing as mp +import os +import unittest + +import torch + +import sglang as sgl +from huggingface_hub import snapshot_download +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import CustomTestCase + +register_cuda_ci( + est_time=200, + suite="stage-b-test-1-gpu-large", +) + +BASE_MODEL = "Qwen/Qwen3-8B" +LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-8B" +LORA_BACKEND = "triton" +MAX_LORA_RANK = 32 +TP_SIZE = 1 +DISABLE_CUDA_GRAPH = True +PREFILL_ATTENTION_BACKEND = "fa4" +DECODE_ATTENTION_BACKEND = "fa4" + +KL_THRESHOLD = 1e-2 + + +def kl_v2(a, b): + a = torch.tensor(a) if not torch.is_tensor(a) else a + b = torch.tensor(b) if not torch.is_tensor(b) else b + return (((a - b) ** 2) * 0.5).mean().item() + + +def get_prompt_logprobs(engine, input_ids, lora_path): + out = engine.generate( + input_ids=input_ids, + sampling_params={"max_new_tokens": 0, "temperature": 0.0}, + return_logprob=True, + logprob_start_len=0, + lora_path=lora_path, + ) + return [logprob for logprob, _, _ in out["meta_info"]["input_token_logprobs"]][1:] + + +class TestLoRAQwen3_8BLogprobDiff(CustomTestCase): + + def test_lora_qwen3_8b_logprob_accuracy(self): + adapter_path = snapshot_download( + LORA_HF_REPO, + repo_type="dataset", + ) + + engine = sgl.Engine( + model_path=BASE_MODEL, + tp_size=TP_SIZE, + enable_lora=True, + max_lora_rank=MAX_LORA_RANK, + lora_paths={"my_lora": adapter_path}, + lora_backend=LORA_BACKEND, + attention_backend="flashinfer", + disable_cuda_graph=DISABLE_CUDA_GRAPH, + 
prefill_attention_backend=PREFILL_ATTENTION_BACKEND, + decode_attention_backend=DECODE_ATTENTION_BACKEND, + ) + + try: + cdata = torch.load( + os.path.join(adapter_path, "compare_sample_train_data.pt"), + weights_only=False, + ) + + base_logprobs = get_prompt_logprobs( + engine, cdata["tokens"], lora_path=None + ) + logprobs = get_prompt_logprobs( + engine, cdata["tokens"], lora_path="my_lora" + ) + + base_t = torch.tensor(base_logprobs) + lora_t = torch.tensor(logprobs) + diff = (base_t - lora_t).abs() + print( + f"[VERIFY] base vs lora: mean_diff={diff.mean().item():.6f}, " + f"max_diff={diff.max().item():.6f}, " + f"identical={torch.equal(base_t, lora_t)}" + ) + + self.assertFalse( + torch.equal(base_t, lora_t), + "LoRA logprobs should differ from base model logprobs", + ) + + kl_sglang_trainer = kl_v2(cdata["training_logprobs"], logprobs) + kl_orig_trainer = kl_v2( + cdata["training_logprobs"], cdata["sampling_logprobs"] + ) + kl_sglang_orig = kl_v2(logprobs, cdata["sampling_logprobs"]) + + print(f"KL(orig_sampler, trainer) = {kl_orig_trainer:.6e}") + print(f"KL(sglang, trainer) = {kl_sglang_trainer:.6e}") + print(f"KL(sglang, orig_sampler) = {kl_sglang_orig:.6e}") + + self.assertLessEqual( + kl_sglang_trainer, + KL_THRESHOLD, + f"KL(sglang, trainer) = {kl_sglang_trainer:.6e} exceeds " + f"threshold {KL_THRESHOLD}", + ) + + finally: + engine.shutdown() + + +if __name__ == "__main__": + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + + try: + unittest.main(warnings="ignore", verbosity=2) + finally: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() From add28dcabfae744408b747bffff5c86b1d9f581b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 00:54:09 +0000 Subject: [PATCH 3/7] pre-commit --- python/sglang/srt/lora/lora_manager.py | 2 +- .../registered/lora/test_lora_qwen3_8b_logprob_diff.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git 
a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 5120c97a72b2..7b222161f669 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -36,8 +36,8 @@ from sglang.srt.lora.mem_pool import LoRAMemoryPool from sglang.srt.lora.utils import ( LoRAType, - get_normalized_target_modules, auto_detect_lora_target_modules, + get_normalized_target_modules, get_target_module_name, ) from sglang.srt.managers.io_struct import LoRAUpdateOutput diff --git a/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py b/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py index 74b5fe9cdb8d..1a40ef32f8a1 100644 --- a/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py +++ b/test/registered/lora/test_lora_qwen3_8b_logprob_diff.py @@ -28,9 +28,9 @@ import unittest import torch +from huggingface_hub import snapshot_download import sglang as sgl -from huggingface_hub import snapshot_download from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import CustomTestCase @@ -95,12 +95,8 @@ def test_lora_qwen3_8b_logprob_accuracy(self): weights_only=False, ) - base_logprobs = get_prompt_logprobs( - engine, cdata["tokens"], lora_path=None - ) - logprobs = get_prompt_logprobs( - engine, cdata["tokens"], lora_path="my_lora" - ) + base_logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path=None) + logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path="my_lora") base_t = torch.tensor(base_logprobs) lora_t = torch.tensor(logprobs) From 767655c3c62673108cc2a37027137d8ee6559706 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 08:47:59 +0000 Subject: [PATCH 4/7] support shared lora format and qwen3_30b_a3b_instruct_2507 --- python/sglang/srt/lora/layers.py | 36 +++- python/sglang/srt/lora/lora.py | 29 ++- python/sglang/srt/lora/lora_manager.py | 60 +++++- python/sglang/srt/lora/lora_moe_runners.py | 27 ++- python/sglang/srt/lora/mem_pool.py | 189 
+++++++++++++----- .../srt/lora/triton_ops/sgemm_lora_b.py | 7 +- python/sglang/srt/server_args.py | 9 + ...wen3_30b_a3b_instruct_2507_logprob_diff.py | 151 ++++++++++++++ 8 files changed, 425 insertions(+), 83 deletions(-) create mode 100644 test/registered/lora/test_lora_qwen3_30b_a3b_instruct_2507_logprob_diff.py diff --git a/python/sglang/srt/lora/layers.py b/python/sglang/srt/lora/layers.py index 21ad10447b46..c0a94bd07582 100644 --- a/python/sglang/srt/lora/layers.py +++ b/python/sglang/srt/lora/layers.py @@ -711,6 +711,9 @@ def __init__( # initializes FusedMoE with its own moe_runner for base path super().__init__(base_layer, lora_backend) + self.experts_shared_outer_loras: bool = False + self.quant_method = base_layer.quant_method + self.tp_size = getattr(base_layer, "moe_tp_size", 1) self.tp_rank = getattr(base_layer, "moe_tp_rank", 0) self.intermediate_size_per_partition = getattr( @@ -782,6 +785,7 @@ def _get_lora_info(self): adapter_enabled=adapter_enabled, max_lora_rank=max_lora_rank, num_experts=self.base_layer.num_experts, + experts_shared_outer_loras=self.experts_shared_outer_loras, tp_size=self.tp_size, tp_rank=self.tp_rank, hidden_size=getattr(self.base_layer, "hidden_size", 0), @@ -838,17 +842,27 @@ def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int): def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int): return B - def slice_moe_lora_a_weights( - self, A: torch.Tensor, tp_rank: int, target_module: str - ) -> torch.Tensor: + def slice_moe_lora_a_weights(self, A, tp_rank: int, target_module: str): """Slice LoRA A weights for MoE with TP. 
+ Accepts: + - 2D tensor [rank, hidden] (single expert) + - 3D tensor [num_experts_or_1, rank, hidden] + - dict {expert_id: 2D tensor} + Per-expert weight shapes: gate_up_proj_moe A: [rank, hidden_size] — input is full hidden_states, no slice down_proj_moe A: [rank, intermediate_size] — input is sharded intermediate """ if self.tp_size <= 1: return A + if isinstance(A, dict): + return {eid: self._slice_moe_a_2d(w, tp_rank, target_module) for eid, w in A.items()} + if isinstance(A, torch.Tensor) and A.dim() == 3: + return torch.stack([self._slice_moe_a_2d(A[i], tp_rank, target_module) for i in range(A.shape[0])]) + return self._slice_moe_a_2d(A, tp_rank, target_module) + + def _slice_moe_a_2d(self, A: torch.Tensor, tp_rank: int, target_module: str) -> torch.Tensor: if target_module == "down_proj_moe": shard_size = self.intermediate_size_per_partition start = tp_rank * shard_size @@ -856,17 +870,27 @@ def slice_moe_lora_a_weights( return A[:, start:end].contiguous() return A - def slice_moe_lora_b_weights( - self, B: torch.Tensor, tp_rank: int, target_module: str - ) -> torch.Tensor: + def slice_moe_lora_b_weights(self, B, tp_rank: int, target_module: str): """Slice LoRA B weights for MoE with TP. 
+ Accepts: + - 2D tensor [output_dim, rank] (single expert) + - 3D tensor [num_experts_or_1, output_dim, rank] + - dict {expert_id: 2D tensor} + Per-expert weight shapes: gate_up_proj_moe B: [intermediate_size*2, rank] — output matches sharded base w13 down_proj_moe B: [hidden_size, rank] — output is all-reduced, no slice """ if self.tp_size <= 1: return B + if isinstance(B, dict): + return {eid: self._slice_moe_b_2d(w, tp_rank, target_module) for eid, w in B.items()} + if isinstance(B, torch.Tensor) and B.dim() == 3: + return torch.stack([self._slice_moe_b_2d(B[i], tp_rank, target_module) for i in range(B.shape[0])]) + return self._slice_moe_b_2d(B, tp_rank, target_module) + + def _slice_moe_b_2d(self, B: torch.Tensor, tp_rank: int, target_module: str) -> torch.Tensor: if target_module == "gate_up_proj_moe": shard_size = self.intermediate_size_per_partition start = tp_rank * shard_size diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py index 8ccb674f9195..8a7237a9aef7 100644 --- a/python/sglang/srt/lora/lora.py +++ b/python/sglang/srt/lora/lora.py @@ -137,6 +137,8 @@ def _normalize_weights(self): for layer in self.layers: weight_names = list(layer.weights.keys()) self.normalize_qkv_proj(weight_names, layer.weights) + self._rename_expert_w_to_proj(layer.weights) + weight_names = list(layer.weights.keys()) self.normalize_gate_up_proj(weight_names, layer.weights) def normalize_qkv_proj( @@ -192,6 +194,23 @@ def normalize_qkv_proj( weights[qkv_name] = weights[qkv_name].repeat(3, 1) # else: no-op as LoRA B weight is already stacked. + def _rename_expert_w_to_proj(self, weights: Dict[str, torch.Tensor]): + """Rename w1 -> gate_proj, w3 -> up_proj, w2 -> down_proj so that + normalize_gate_up_proj can stack them into gate_up_proj.""" + renames = {} + for name in list(weights.keys()): + new_name = name + if ".w1." in name: + new_name = name.replace(".w1.", ".gate_proj.") + elif ".w3." 
in name: + new_name = name.replace(".w3.", ".up_proj.") + elif ".w2." in name: + new_name = name.replace(".w2.", ".down_proj.") + if new_name != name: + renames[name] = new_name + for old_name, new_name in renames.items(): + weights[new_name] = weights.pop(old_name) + def normalize_gate_up_proj( self, weight_names: List[str], weights: Dict[str, torch.Tensor] ): @@ -206,8 +225,9 @@ def normalize_gate_up_proj( f"Received backend: {self.lora_backend.name}. Please verify your backend configuration " f"or consider implementing custom initialization logic for other backends." ) + cat_dim = weights[weight_name].dim() - 2 weights[gate_up_name] = torch.cat( - (weights[weight_name], weights[up_name]), 0 + (weights[weight_name], weights[up_name]), cat_dim ) weights.pop(weight_name) if up_name in weights: @@ -216,7 +236,12 @@ def normalize_gate_up_proj( # If gate_up_proj is already stacked, we normalize it following the SGL convention gate_up_name = weight_name if "lora_A" in weight_name: - weights[gate_up_name] = weights[gate_up_name].repeat(2, 1) + ndim = weights[gate_up_name].dim() + repeat_dims = [1] * ndim + repeat_dims[ndim - 2] = 2 + weights[gate_up_name] = weights[gate_up_name].repeat( + *repeat_dims + ) # else: no-op as LoRA B weight is already stacked. 
def pin_weights_in_cpu(self): diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 7b222161f669..d8ecc15e92c3 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -78,8 +78,10 @@ def __init__( server_args.enable_lora_overlap_loading ) - # Store eviction policy from server args self.eviction_policy = server_args.lora_eviction_policy + self._experts_shared_outer_override: Optional[bool] = ( + server_args.experts_shared_outer_loras + ) # LoRA backend for running sgemm kernels logger.info(f"Using {lora_backend} as backend of LoRA kernels.") @@ -303,23 +305,33 @@ def update_lora_info(self): if isinstance(module, FusedMoEWithLoRA) and all( x in self.target_modules for x in ["gate_up_proj", "down_proj"] ): + gate_up_key = ( + "gate_up_proj_moe" + if "gate_up_proj_moe" in self.memory_pool.A_buffer + else "gate_up_proj" + ) + down_key = ( + "down_proj_moe" + if "down_proj_moe" in self.memory_pool.A_buffer + else "down_proj" + ) gate_up_a = self.memory_pool.get_tensor( - target_module="gate_up_proj_moe", + target_module=gate_up_key, layer_id=layer_id, lora_type=LoRAType.LORA_A, ) gate_up_b = self.memory_pool.get_tensor( - target_module="gate_up_proj_moe", + target_module=gate_up_key, layer_id=layer_id, lora_type=LoRAType.LORA_B, ) down_a = self.memory_pool.get_tensor( - target_module="down_proj_moe", + target_module=down_key, layer_id=layer_id, lora_type=LoRAType.LORA_A, ) down_b = self.memory_pool.get_tensor( - target_module="down_proj_moe", + target_module=down_key, layer_id=layer_id, lora_type=LoRAType.LORA_B, ) @@ -387,6 +399,16 @@ def init_state( target_modules=target_modules, ) + if self._experts_shared_outer_override is not None: + self.experts_shared_outer_loras = self._experts_shared_outer_override + else: + self.experts_shared_outer_loras = self._detect_shared_outer_loras() + if self.experts_shared_outer_loras: + logger.info( + "Shared outer LoRA mode enabled: gate_up 
lora_A and " + "down lora_B will be shared across experts (expert_dim=1)." + ) + self.init_lora_modules() self.init_memory_pool() self.update_lora_info() @@ -412,6 +434,26 @@ def init_lora_adapters(self, lora_paths: Optional[List[LoRARef]] = None): f"Failed to load LoRA adapter {lora_ref.lora_name}: {result.error_message}" ) + def _detect_shared_outer_loras(self) -> bool: + """Auto-detect shared outer LoRA format from loaded adapter weights. + + MoE adapters with shared outer experts store 3D tensors where + dim[0]=1 indicates weights shared across all experts, while + dim[0]=num_experts indicates per-expert weights. + Returns True if gate_up lora_A has expert_dim=1 (shared). + """ + for adapter in self.loras.values(): + for layer in adapter.layers: + for name, weight in layer.weights.items(): + if ( + "gate_up_proj" in name + and "lora_A" in name + and weight.dim() == 3 + ): + return weight.shape[0] == 1 + break + return False + def init_lora_shapes( self, max_lora_rank: Optional[int] = None, @@ -589,6 +631,7 @@ def init_memory_pool(self): base_model=self.base_model, eviction_policy=self.eviction_policy, lora_added_tokens_size=self.lora_added_tokens_size, + experts_shared_outer_loras=self.experts_shared_outer_loras, ) # Initializing memory pool with base model @@ -683,11 +726,12 @@ def init_lora_modules(self): ) continue - # Temporarily workaround for FusedMoE layer if isinstance(module, FusedMoE) and all( x in self.target_modules for x in ["gate_up_proj", "down_proj"] ): layer_id = get_layer_id(module_name) - self.lora_modules[layer_id][module_name] = self.set_lora_module( - module_name, module + lora_module = self.set_lora_module(module_name, module) + lora_module.experts_shared_outer_loras = ( + self.experts_shared_outer_loras ) + self.lora_modules[layer_id][module_name] = lora_module diff --git a/python/sglang/srt/lora/lora_moe_runners.py b/python/sglang/srt/lora/lora_moe_runners.py index 76ac964f69ac..3060e2fadd19 100644 --- 
a/python/sglang/srt/lora/lora_moe_runners.py +++ b/python/sglang/srt/lora/lora_moe_runners.py @@ -71,17 +71,22 @@ class LoRAInfo: """LoRA weights and dispatch info for MoE computation.""" - # LoRA weights: [num_loras, num_experts, dim1, dim2] + # LoRA weights: [num_loras, num_experts_or_1, dim1, dim2] + # When experts_shared_outer_loras=True: + # gate_up_lora_a: [num_loras, 1, max_rank, hidden_dim] (shared) + # down_lora_b: [num_loras, 1, hidden_dim, max_rank] (shared) gate_up_lora_a_weights: ( torch.Tensor - ) # [num_loras, num_experts, max_rank, hidden_dim] + ) # [num_loras, num_experts_or_1, max_rank, hidden_dim] gate_up_lora_b_weights: ( torch.Tensor ) # [num_loras, num_experts, gate_up_dim, max_rank] down_lora_a_weights: ( torch.Tensor ) # [num_loras, num_experts, max_rank, intermediate_dim] - down_lora_b_weights: torch.Tensor # [num_loras, num_experts, hidden_dim, max_rank] + down_lora_b_weights: ( + torch.Tensor + ) # [num_loras, num_experts_or_1, hidden_dim, max_rank] # Indice pointers of each segment in shape (num_segments + 1, ) seg_indptr: torch.Tensor @@ -95,6 +100,7 @@ class LoRAInfo: max_lora_rank: int # Maximum LoRA rank across all adapters num_experts: int + experts_shared_outer_loras: bool = False fully_sharded: bool = False tp_size: int = 1 @@ -469,16 +475,11 @@ def _add_lora_gate_up_delta( r = lora_info.max_lora_rank gate_up_a = lora_info.gate_up_lora_a_weights + if lora_info.experts_shared_outer_loras: + gate_up_a = gate_up_a.expand(-1, lora_info.num_experts, -1, -1) gate_up_b = lora_info.gate_up_lora_b_weights inter_size = gate_up_b.shape[2] // 2 - # Split packed gate_up weights into separate gate and up slices. - # gate_up_lora_a has shape [max_loras, num_experts, 2*r, hidden_dim] - # where the first r rows are gate_lora_a and the next r are up_lora_a. - # gate_up_lora_b has shape [max_loras, num_experts, 2*inter_size, r] - # where the first inter_size rows are gate_lora_b and the rest up_lora_b. 
- # Using num_slices=2 lets the kernel handle gate and up independently, - # keeping the rank dimension at r so shrink and expand both match. lora_a_stacked = [gate_up_a[:, :, :r, :], gate_up_a[:, :, r : 2 * r, :]] lora_b_stacked = [ gate_up_b[:, :, :inter_size, :], @@ -542,8 +543,12 @@ def _add_lora_down_delta( if lora_info.max_lora_rank == 0: return + down_lora_b = lora_info.down_lora_b_weights + if lora_info.experts_shared_outer_loras: + down_lora_b = down_lora_b.expand(-1, lora_info.num_experts, -1, -1) + lora_a_stacked = [lora_info.down_lora_a_weights] - lora_b_stacked = [lora_info.down_lora_b_weights] + lora_b_stacked = [down_lora_b] if lora_info.fully_sharded and lora_info.tp_size > 1: shard_size = lora_info.hidden_size // lora_info.tp_size diff --git a/python/sglang/srt/lora/mem_pool.py b/python/sglang/srt/lora/mem_pool.py index ca3310a9d289..f496f66f1850 100644 --- a/python/sglang/srt/lora/mem_pool.py +++ b/python/sglang/srt/lora/mem_pool.py @@ -60,6 +60,7 @@ def __init__( base_model: torch.nn.Module, eviction_policy: str, lora_added_tokens_size: int, + experts_shared_outer_loras: bool = False, ): self.base_hf_config: AutoConfig = base_hf_config self.num_layer: int = base_hf_config.num_hidden_layers @@ -70,6 +71,7 @@ def __init__( self.lora_added_tokens_size: int = lora_added_tokens_size self.max_lora_rank: int = max_lora_rank self.target_modules: Set[str] = target_modules + self.experts_shared_outer_loras: bool = experts_shared_outer_loras # Initialize eviction policy self.eviction_policy = get_eviction_policy(eviction_policy) @@ -140,6 +142,18 @@ def is_moe_module(self, module_name: str) -> bool: """Check if module is part of MoE experts.""" return "moe" in module_name + @staticmethod + def _get_num_experts(base_model: torch.nn.Module) -> int: + cfg = base_model.config + if hasattr(cfg, "get_text_config"): + cfg = cfg.get_text_config() + return ( + getattr(cfg, "num_experts", None) + or getattr(cfg, "num_local_experts", None) + or getattr(cfg, 
"n_routed_experts", None) + or 1 + ) + def _get_standard_shape( self, module_name: str, @@ -178,10 +192,16 @@ def get_lora_A_shape( input_dim = divide(input_dim, self.tp_size) if self.is_moe_module(module_name): - num_experts = base_model.config.num_experts + num_experts = self._get_num_experts(base_model) + expert_dim = num_experts + if ( + self.experts_shared_outer_loras + and module_name == "gate_up_proj_moe" + ): + expert_dim = 1 return ( self.max_loras_per_batch, - num_experts, + expert_dim, max_lora_dim * c, input_dim, ) @@ -228,8 +248,14 @@ def get_lora_B_shape( # Check if MoE module and return appropriate shape if self.is_moe_module(module_name): - num_experts = base_model.config.num_experts - return (self.max_loras_per_batch, num_experts, output_dim, max_lora_dim) + num_experts = self._get_num_experts(base_model) + expert_dim = num_experts + if ( + self.experts_shared_outer_loras + and module_name == "down_proj_moe" + ): + expert_dim = 1 + return (self.max_loras_per_batch, expert_dim, output_dim, max_lora_dim) else: return (self.max_loras_per_batch, output_dim, max_lora_dim) @@ -264,32 +290,36 @@ def init_buffer( target_modules: Set[str], get_lora_shape_fn: Callable[[str, torch.nn.Module, int, int], Tuple[int]], ): - # Check if model has both shared experts and MoE experts + cfg = base_model.config + if hasattr(cfg, "get_text_config"): + cfg = cfg.get_text_config() has_shared_experts = ( - hasattr(base_model.config, "shared_expert_intermediate_size") - and base_model.config.shared_expert_intermediate_size > 0 + ( + hasattr(cfg, "shared_expert_intermediate_size") + and cfg.shared_expert_intermediate_size > 0 + ) + or (getattr(cfg, "n_shared_experts", 0) or 0) > 0 ) - has_moe = getattr(base_model.config, "num_experts", 1) > 1 + has_moe = self._get_num_experts(base_model) > 1 # Shape functions automatically handle both 3D (standard) and 4D (MoE) target_modules = target_modules - set(EMBEDDING_NAMES) for module_name in target_modules: # Special handling for 
ambiguous target modules that can be in different contexts ambiguous_modules = {"gate_up_proj", "down_proj"} - if module_name in ambiguous_modules and has_shared_experts and has_moe: - # Allocate separate buffers for shared and MoE contexts - # Shared expert version (3D) - shared_key = module_name - buffer[shared_key] = [ - torch.empty( - get_lora_shape_fn( - module_name, base_model, self.max_lora_rank, idx - ), - dtype=self.dtype, - device=device, - ) - for idx in range(self.num_layer) - ] + if module_name in ambiguous_modules and has_moe: + # Allocate shared expert version (3D) only when model has shared experts + if has_shared_experts: + buffer[module_name] = [ + torch.zeros( + get_lora_shape_fn( + module_name, base_model, self.max_lora_rank, idx + ), + dtype=self.dtype, + device=device, + ) + for idx in range(self.num_layer) + ] # MoE expert version (4D) moe_key = f"{module_name}_moe" @@ -521,8 +551,8 @@ def load_lora_weight_tensor( expert_match = re.search(r"experts\.(\d+)\.", name) if expert_match: + # Per-expert MoE weight — 2D tensors, one per expert target_module = target_module + "_moe" - # MoE weight - multiple tensors per module (one per expert) if temp_A_buffer[target_module] is None: temp_A_buffer[target_module] = {} temp_B_buffer[target_module] = {} @@ -532,8 +562,15 @@ def load_lora_weight_tensor( temp_A_buffer[target_module][expert_id] = weights else: temp_B_buffer[target_module][expert_id] = weights + elif "experts" in name and weights.dim() == 3: + # Shared outer MoE weight — 3D tensor [expert_dim, rank, hidden] + target_module = target_module + "_moe" + if "lora_A" in name: + temp_A_buffer[target_module] = weights + else: + temp_B_buffer[target_module] = weights else: - # Standard weight - single tensor per module + # Standard weight — single tensor per module if "lora_A" in name: temp_A_buffer[target_module] = weights else: @@ -549,20 +586,18 @@ def load_lora_weight_tensor( if isinstance(module, FusedMoEWithLoRA): moe_target_modules = 
["gate_up_proj_moe", "down_proj_moe"] for target_module in moe_target_modules: - if temp_A_buffer[target_module] is None: - continue - - for expert_id in temp_A_buffer[target_module].keys(): - temp_A_buffer[target_module][expert_id] = ( + if temp_A_buffer.get(target_module) is not None: + temp_A_buffer[target_module] = ( module.slice_moe_lora_a_weights( - temp_A_buffer[target_module][expert_id], + temp_A_buffer[target_module], self.tp_rank, target_module, ) ) - temp_B_buffer[target_module][expert_id] = ( + if temp_B_buffer.get(target_module) is not None: + temp_B_buffer[target_module] = ( module.slice_moe_lora_b_weights( - temp_B_buffer[target_module][expert_id], + temp_B_buffer[target_module], self.tp_rank, target_module, ) @@ -587,22 +622,41 @@ def load_lora_weight_tensor( temp_B_buffer[target_module], self.tp_rank ) - # Load weights into buffers (handles both 3D standard and 4D MoE) for name, weights in temp_A_buffer.items(): c = get_stacked_multiply(name) target_buffer = self.A_buffer[name][layer_id] if name in ["gate_up_proj_moe", "down_proj_moe"]: - # MoE: multiple tensors per module (one per expert) - for expert_id, expert_weight in weights.items(): - # Buffer shape: [num_loras, num_experts, max_rank, hidden_dim] - buffer_view = target_buffer[ - buffer_id, expert_id, : lora_rank * c, : - ] - load_lora_weight_tensor(buffer_view, expert_weight) + if ( + self.experts_shared_outer_loras + and name == "gate_up_proj_moe" + ): + if isinstance(weights, torch.Tensor) and weights.dim() == 3: + buffer_view = target_buffer[ + buffer_id, 0, : lora_rank * c, : + ] + load_lora_weight_tensor(buffer_view, weights[0]) + elif isinstance(weights, dict) and len(weights) > 0: + rep = next(iter(weights.values())) + buffer_view = target_buffer[ + buffer_id, 0, : lora_rank * c, : + ] + load_lora_weight_tensor(buffer_view, rep) + else: + target_buffer[buffer_id].zero_() + elif isinstance(weights, torch.Tensor) and weights.dim() == 3: + for eid in range(weights.shape[0]): + 
buffer_view = target_buffer[ + buffer_id, eid, : lora_rank * c, : + ] + load_lora_weight_tensor(buffer_view, weights[eid]) + elif isinstance(weights, dict): + for expert_id, expert_weight in weights.items(): + buffer_view = target_buffer[ + buffer_id, expert_id, : lora_rank * c, : + ] + load_lora_weight_tensor(buffer_view, expert_weight) else: - # Standard: single tensor per module - c = get_stacked_multiply(name) buffer_view = target_buffer[buffer_id, : lora_rank * c, :] load_lora_weight_tensor(buffer_view, weights) @@ -610,18 +664,47 @@ def load_lora_weight_tensor( target_buffer = self.B_buffer[name][layer_id] if name in ["gate_up_proj_moe", "down_proj_moe"]: - # MoE: multiple tensors per module (one per expert) - for expert_id, expert_weight in weights.items(): - # Buffer shape: [num_loras, num_experts, intermediate_dim, max_rank] - buffer_view = target_buffer[buffer_id, expert_id, :, :lora_rank] - - weight_to_load = expert_weight - if weight_to_load is not None: - weight_to_load = weight_to_load * lora_adapter.scaling - - load_lora_weight_tensor(buffer_view, weight_to_load) + if ( + self.experts_shared_outer_loras + and name == "down_proj_moe" + ): + if isinstance(weights, torch.Tensor) and weights.dim() == 3: + buffer_view = target_buffer[ + buffer_id, 0, :, :lora_rank + ] + w = weights[0] + if w is not None: + w = w * lora_adapter.scaling + load_lora_weight_tensor(buffer_view, w) + elif isinstance(weights, dict) and len(weights) > 0: + rep = next(iter(weights.values())) + buffer_view = target_buffer[ + buffer_id, 0, :, :lora_rank + ] + if rep is not None: + rep = rep * lora_adapter.scaling + load_lora_weight_tensor(buffer_view, rep) + else: + target_buffer[buffer_id].zero_() + elif isinstance(weights, torch.Tensor) and weights.dim() == 3: + for eid in range(weights.shape[0]): + buffer_view = target_buffer[ + buffer_id, eid, :, :lora_rank + ] + w = weights[eid] + if w is not None: + w = w * lora_adapter.scaling + load_lora_weight_tensor(buffer_view, w) + elif 
isinstance(weights, dict): + for expert_id, expert_weight in weights.items(): + buffer_view = target_buffer[ + buffer_id, expert_id, :, :lora_rank + ] + w = expert_weight + if w is not None: + w = w * lora_adapter.scaling + load_lora_weight_tensor(buffer_view, w) else: - # Standard: single tensor per module buffer_view = target_buffer[buffer_id, :, :lora_rank] load_lora_weight_tensor(buffer_view, weights) diff --git a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py index 357d3280548c..b796cdd0efa4 100644 --- a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py @@ -87,6 +87,7 @@ def _sgemm_lora_b_kernel( ) # Iterate to compute the block in output matrix + n_mask = n_offset[None, :] < N partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): x_tile = tl.load( @@ -96,7 +97,7 @@ def _sgemm_lora_b_kernel( ) w_tile = tl.load( w_ptrs, - mask=(k_offset[:, None] < K - k * BLOCK_K), + mask=(k_offset[:, None] < K - k * BLOCK_K) & n_mask, other=0.0, ) partial_sum += tl.dot(x_tile, w_tile) @@ -110,8 +111,8 @@ def _sgemm_lora_b_kernel( output_ptr = (output + seg_start * output_stride_0) + ( s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1 ) - output_mask = s_offset[:, None] < seg_len - partial_sum += tl.load(output_ptr, mask=output_mask) + output_mask = (s_offset[:, None] < seg_len) & n_mask + partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0) tl.store(output_ptr, partial_sum, mask=output_mask) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 3b4659631fc8..ae5d144a73f9 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -464,6 +464,7 @@ class ServerArgs: lora_eviction_policy: str = "lru" lora_backend: str = "csgmv" max_lora_chunk_size: Optional[int] = 16 + experts_shared_outer_loras: Optional[bool] = None # Kernel 
backend attention_backend: Optional[str] = None @@ -4548,6 +4549,14 @@ def add_cli_args(parser: argparse.ArgumentParser): choices=[16, 32, 64, 128], help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.", ) + parser.add_argument( + "--experts-shared-outer-loras", + default=ServerArgs.experts_shared_outer_loras, + action="store_true", + help="Force shared outer LoRA mode for MoE models. " + "When set, w1/w3 lora_A and w2 lora_B are shared across experts " + "(expert_dim=1). By default this is auto-detected from adapter weights.", + ) # Kernel backend parser.add_argument( diff --git a/test/registered/lora/test_lora_qwen3_30b_a3b_instruct_2507_logprob_diff.py b/test/registered/lora/test_lora_qwen3_30b_a3b_instruct_2507_logprob_diff.py new file mode 100644 index 000000000000..8bdb58551389 --- /dev/null +++ b/test/registered/lora/test_lora_qwen3_30b_a3b_instruct_2507_logprob_diff.py @@ -0,0 +1,151 @@ +# Copyright 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +Regression test for Qwen3-30B-A3B-Instruct-2507 LoRA logprob accuracy. + +Compares SGLang LoRA logprobs against reference training logprobs from a +pre-computed dataset. 
The LoRA adapter and reference data are downloaded from: +https://huggingface.co/datasets/yushengsu/lora-diff-Qwen3-30B-A3B-Instruct-2507 + +Usage: + python -m unittest test_lora_qwen3_30b_a3b_instruct_2507_logprob_diff +""" + +import multiprocessing as mp +import os +import unittest + +import torch +from huggingface_hub import snapshot_download + +import sglang as sgl +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import CustomTestCase + +register_cuda_ci( + est_time=300, + suite="stage-c-test-8-gpu-h200", +) + +BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507" +LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-30B-A3B-Instruct-2507" +LORA_BACKEND = "triton" +MAX_LORA_RANK = 32 +TP_SIZE = 8 +DISABLE_CUDA_GRAPH = True +MOE_RUNNER_BACKEND = "triton" +EXPERTS_SHARED_OUTER_LORAS = True +PREFILL_ATTENTION_BACKEND = "fa4" +DECODE_ATTENTION_BACKEND = "fa4" + +KL_THRESHOLD = 1e-2 + + +def kl_v2(a, b): + a = torch.tensor(a) if not torch.is_tensor(a) else a + b = torch.tensor(b) if not torch.is_tensor(b) else b + return (((a - b) ** 2) * 0.5).mean().item() + + +def get_prompt_logprobs(engine, input_ids, lora_path): + out = engine.generate( + input_ids=input_ids, + sampling_params={"max_new_tokens": 0, "temperature": 0.0}, + return_logprob=True, + logprob_start_len=0, + lora_path=lora_path, + ) + return [logprob for logprob, _, _ in out["meta_info"]["input_token_logprobs"]][1:] + + +class TestLoRAQwen3_30B_A3B_Instruct_2507_LogprobDiff(CustomTestCase): + + def test_lora_qwen3_30b_a3b_instruct_2507_logprob_accuracy(self): + adapter_path = snapshot_download( + LORA_HF_REPO, + repo_type="dataset", + ) + + engine = sgl.Engine( + model_path=BASE_MODEL, + tp_size=TP_SIZE, + enable_lora=True, + max_lora_rank=MAX_LORA_RANK, + lora_paths={"my_lora": adapter_path}, + lora_backend=LORA_BACKEND, + attention_backend="flashinfer", + disable_cuda_graph=DISABLE_CUDA_GRAPH, + moe_runner_backend=MOE_RUNNER_BACKEND, + 
experts_shared_outer_loras=EXPERTS_SHARED_OUTER_LORAS, + prefill_attention_backend=PREFILL_ATTENTION_BACKEND, + decode_attention_backend=DECODE_ATTENTION_BACKEND, + ) + + try: + cdata = torch.load( + os.path.join(adapter_path, "compare_sample_train_data.pt"), + weights_only=False, + ) + + base_logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path=None) + logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path="my_lora") + + base_t = torch.tensor(base_logprobs) + lora_t = torch.tensor(logprobs) + diff = (base_t - lora_t).abs() + print( + f"[VERIFY] base vs lora: mean_diff={diff.mean().item():.6f}, " + f"max_diff={diff.max().item():.6f}, " + f"identical={torch.equal(base_t, lora_t)}" + ) + + self.assertFalse( + torch.equal(base_t, lora_t), + "LoRA logprobs should differ from base model logprobs", + ) + + kl_sglang_trainer = kl_v2(cdata["training_logprobs"], logprobs) + kl_orig_trainer = kl_v2( + cdata["training_logprobs"], cdata["sampling_logprobs"] + ) + kl_sglang_orig = kl_v2(logprobs, cdata["sampling_logprobs"]) + + print(f"KL(orig_sampler, trainer) = {kl_orig_trainer:.6e}") + print(f"KL(sglang, trainer) = {kl_sglang_trainer:.6e}") + print(f"KL(sglang, orig_sampler) = {kl_sglang_orig:.6e}") + + self.assertLessEqual( + kl_sglang_trainer, + KL_THRESHOLD, + f"KL(sglang, trainer) = {kl_sglang_trainer:.6e} exceeds " + f"threshold {KL_THRESHOLD}", + ) + + finally: + engine.shutdown() + + +if __name__ == "__main__": + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + + try: + unittest.main(warnings="ignore", verbosity=2) + finally: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() From 3ea7296fb6d9e636f1930792bde29f52eb4fecda Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 09:37:53 +0000 Subject: [PATCH 5/7] pre-commit --- python/sglang/srt/lora/layers.py | 32 ++++++++++++++++---- python/sglang/srt/lora/lora.py | 4 +-- python/sglang/srt/lora/lora_manager.py | 4 +-- 
python/sglang/srt/lora/mem_pool.py | 41 +++++++------------------- 4 files changed, 38 insertions(+), 43 deletions(-) diff --git a/python/sglang/srt/lora/layers.py b/python/sglang/srt/lora/layers.py index c0a94bd07582..c246a732458c 100644 --- a/python/sglang/srt/lora/layers.py +++ b/python/sglang/srt/lora/layers.py @@ -857,12 +857,22 @@ def slice_moe_lora_a_weights(self, A, tp_rank: int, target_module: str): if self.tp_size <= 1: return A if isinstance(A, dict): - return {eid: self._slice_moe_a_2d(w, tp_rank, target_module) for eid, w in A.items()} + return { + eid: self._slice_moe_a_2d(w, tp_rank, target_module) + for eid, w in A.items() + } if isinstance(A, torch.Tensor) and A.dim() == 3: - return torch.stack([self._slice_moe_a_2d(A[i], tp_rank, target_module) for i in range(A.shape[0])]) + return torch.stack( + [ + self._slice_moe_a_2d(A[i], tp_rank, target_module) + for i in range(A.shape[0]) + ] + ) return self._slice_moe_a_2d(A, tp_rank, target_module) - def _slice_moe_a_2d(self, A: torch.Tensor, tp_rank: int, target_module: str) -> torch.Tensor: + def _slice_moe_a_2d( + self, A: torch.Tensor, tp_rank: int, target_module: str + ) -> torch.Tensor: if target_module == "down_proj_moe": shard_size = self.intermediate_size_per_partition start = tp_rank * shard_size @@ -885,12 +895,22 @@ def slice_moe_lora_b_weights(self, B, tp_rank: int, target_module: str): if self.tp_size <= 1: return B if isinstance(B, dict): - return {eid: self._slice_moe_b_2d(w, tp_rank, target_module) for eid, w in B.items()} + return { + eid: self._slice_moe_b_2d(w, tp_rank, target_module) + for eid, w in B.items() + } if isinstance(B, torch.Tensor) and B.dim() == 3: - return torch.stack([self._slice_moe_b_2d(B[i], tp_rank, target_module) for i in range(B.shape[0])]) + return torch.stack( + [ + self._slice_moe_b_2d(B[i], tp_rank, target_module) + for i in range(B.shape[0]) + ] + ) return self._slice_moe_b_2d(B, tp_rank, target_module) - def _slice_moe_b_2d(self, B: torch.Tensor, tp_rank: 
int, target_module: str) -> torch.Tensor: + def _slice_moe_b_2d( + self, B: torch.Tensor, tp_rank: int, target_module: str + ) -> torch.Tensor: if target_module == "gate_up_proj_moe": shard_size = self.intermediate_size_per_partition start = tp_rank * shard_size diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py index 8a7237a9aef7..b5c9b06b640f 100644 --- a/python/sglang/srt/lora/lora.py +++ b/python/sglang/srt/lora/lora.py @@ -239,9 +239,7 @@ def normalize_gate_up_proj( ndim = weights[gate_up_name].dim() repeat_dims = [1] * ndim repeat_dims[ndim - 2] = 2 - weights[gate_up_name] = weights[gate_up_name].repeat( - *repeat_dims - ) + weights[gate_up_name] = weights[gate_up_name].repeat(*repeat_dims) # else: no-op as LoRA B weight is already stacked. def pin_weights_in_cpu(self): diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index d8ecc15e92c3..13521a4faa74 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -731,7 +731,5 @@ def init_lora_modules(self): ): layer_id = get_layer_id(module_name) lora_module = self.set_lora_module(module_name, module) - lora_module.experts_shared_outer_loras = ( - self.experts_shared_outer_loras - ) + lora_module.experts_shared_outer_loras = self.experts_shared_outer_loras self.lora_modules[layer_id][module_name] = lora_module diff --git a/python/sglang/srt/lora/mem_pool.py b/python/sglang/srt/lora/mem_pool.py index f496f66f1850..65109eebc9cb 100644 --- a/python/sglang/srt/lora/mem_pool.py +++ b/python/sglang/srt/lora/mem_pool.py @@ -194,10 +194,7 @@ def get_lora_A_shape( if self.is_moe_module(module_name): num_experts = self._get_num_experts(base_model) expert_dim = num_experts - if ( - self.experts_shared_outer_loras - and module_name == "gate_up_proj_moe" - ): + if self.experts_shared_outer_loras and module_name == "gate_up_proj_moe": expert_dim = 1 return ( self.max_loras_per_batch, @@ -250,10 +247,7 @@ def 
get_lora_B_shape( if self.is_moe_module(module_name): num_experts = self._get_num_experts(base_model) expert_dim = num_experts - if ( - self.experts_shared_outer_loras - and module_name == "down_proj_moe" - ): + if self.experts_shared_outer_loras and module_name == "down_proj_moe": expert_dim = 1 return (self.max_loras_per_batch, expert_dim, output_dim, max_lora_dim) else: @@ -294,12 +288,9 @@ def init_buffer( if hasattr(cfg, "get_text_config"): cfg = cfg.get_text_config() has_shared_experts = ( - ( - hasattr(cfg, "shared_expert_intermediate_size") - and cfg.shared_expert_intermediate_size > 0 - ) - or (getattr(cfg, "n_shared_experts", 0) or 0) > 0 - ) + hasattr(cfg, "shared_expert_intermediate_size") + and cfg.shared_expert_intermediate_size > 0 + ) or (getattr(cfg, "n_shared_experts", 0) or 0) > 0 has_moe = self._get_num_experts(base_model) > 1 # Shape functions automatically handle both 3D (standard) and 4D (MoE) @@ -627,10 +618,7 @@ def load_lora_weight_tensor( target_buffer = self.A_buffer[name][layer_id] if name in ["gate_up_proj_moe", "down_proj_moe"]: - if ( - self.experts_shared_outer_loras - and name == "gate_up_proj_moe" - ): + if self.experts_shared_outer_loras and name == "gate_up_proj_moe": if isinstance(weights, torch.Tensor) and weights.dim() == 3: buffer_view = target_buffer[ buffer_id, 0, : lora_rank * c, : @@ -664,23 +652,16 @@ def load_lora_weight_tensor( target_buffer = self.B_buffer[name][layer_id] if name in ["gate_up_proj_moe", "down_proj_moe"]: - if ( - self.experts_shared_outer_loras - and name == "down_proj_moe" - ): + if self.experts_shared_outer_loras and name == "down_proj_moe": if isinstance(weights, torch.Tensor) and weights.dim() == 3: - buffer_view = target_buffer[ - buffer_id, 0, :, :lora_rank - ] + buffer_view = target_buffer[buffer_id, 0, :, :lora_rank] w = weights[0] if w is not None: w = w * lora_adapter.scaling load_lora_weight_tensor(buffer_view, w) elif isinstance(weights, dict) and len(weights) > 0: rep = 
next(iter(weights.values())) - buffer_view = target_buffer[ - buffer_id, 0, :, :lora_rank - ] + buffer_view = target_buffer[buffer_id, 0, :, :lora_rank] if rep is not None: rep = rep * lora_adapter.scaling load_lora_weight_tensor(buffer_view, rep) @@ -688,9 +669,7 @@ def load_lora_weight_tensor( target_buffer[buffer_id].zero_() elif isinstance(weights, torch.Tensor) and weights.dim() == 3: for eid in range(weights.shape[0]): - buffer_view = target_buffer[ - buffer_id, eid, :, :lora_rank - ] + buffer_view = target_buffer[buffer_id, eid, :, :lora_rank] w = weights[eid] if w is not None: w = w * lora_adapter.scaling From 4fd3f01621c5df91b256eae5e91962577404377f Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 26 Mar 2026 09:51:39 +0000 Subject: [PATCH 6/7] update --- python/sglang/srt/models/qwen3_vl_moe.py | 3 +- test/manual/lora/test_lora_qwen3_vl.py | 4 + ..._qwen3_vl_30b_a3b_instruct_logprob_diff.py | 151 ++++++++++++++++++ 3 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py diff --git a/python/sglang/srt/models/qwen3_vl_moe.py b/python/sglang/srt/models/qwen3_vl_moe.py index cf1cb3879b42..93746e5fcee8 100644 --- a/python/sglang/srt/models/qwen3_vl_moe.py +++ b/python/sglang/srt/models/qwen3_vl_moe.py @@ -179,9 +179,8 @@ def __init__( ): super().__init__(config, quant_config, prefix, language_model_cls) - # Only allow LoRA on attention projections within text layers for MoE. 
_lora_pattern_moe = re.compile( - r"^model\.layers\.(\d+)\.self_attn\.(?:qkv_proj|o_proj)$" + r"^(?:model\.layers\.(\d+)\.(?:self_attn\.(?:qkv_proj|o_proj)|mlp\.experts)|lm_head|model\.embed_tokens)$" ) def should_apply_lora(self, module_name: str) -> bool: diff --git a/test/manual/lora/test_lora_qwen3_vl.py b/test/manual/lora/test_lora_qwen3_vl.py index cef3649919a4..728fa8bc9959 100644 --- a/test/manual/lora/test_lora_qwen3_vl.py +++ b/test/manual/lora/test_lora_qwen3_vl.py @@ -47,11 +47,15 @@ def test_qwen3_vl_moe_should_apply_lora_regex(self): positives = ( "model.layers.0.self_attn.qkv_proj", "model.layers.5.self_attn.o_proj", + "model.layers.0.mlp.experts", + "lm_head", + "model.embed_tokens", ) negatives = ( "model.layers.0.mlp.gate_up_proj", "model.layers.0.mlp.down_proj", "visual.blocks.0.attn.qkv_proj", + "visual.blocks.0.mlp.experts", "model.layers.x.self_attn.qkv_proj", "model.layers.0.attn.qkv_proj", ) diff --git a/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py b/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py new file mode 100644 index 000000000000..cd71aa1afd50 --- /dev/null +++ b/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py @@ -0,0 +1,151 @@ +# Copyright 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +Regression test for Qwen3-VL-30B-A3B-Instruct LoRA logprob accuracy. 
+ +Compares SGLang LoRA logprobs against reference training logprobs from a +pre-computed dataset. The LoRA adapter and reference data are downloaded from: +https://huggingface.co/datasets/yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct + +Usage: + python -m unittest test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff +""" + +import multiprocessing as mp +import os +import unittest + +import torch +from huggingface_hub import snapshot_download + +import sglang as sgl +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import CustomTestCase + +register_cuda_ci( + est_time=300, + suite="stage-c-test-8-gpu-h200", +) + +BASE_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct" +LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct" +LORA_BACKEND = "triton" +MAX_LORA_RANK = 32 +TP_SIZE = 8 +DISABLE_CUDA_GRAPH = True +MOE_RUNNER_BACKEND = "triton" +EXPERTS_SHARED_OUTER_LORAS = True +PREFILL_ATTENTION_BACKEND = "fa4" +DECODE_ATTENTION_BACKEND = "fa4" + +KL_THRESHOLD = 1e-2 + + +def kl_v2(a, b): + a = torch.tensor(a) if not torch.is_tensor(a) else a + b = torch.tensor(b) if not torch.is_tensor(b) else b + return (((a - b) ** 2) * 0.5).mean().item() + + +def get_prompt_logprobs(engine, input_ids, lora_path): + out = engine.generate( + input_ids=input_ids, + sampling_params={"max_new_tokens": 0, "temperature": 0.0}, + return_logprob=True, + logprob_start_len=0, + lora_path=lora_path, + ) + return [logprob for logprob, _, _ in out["meta_info"]["input_token_logprobs"]][1:] + + +class TestLoRAQwen3VL_30B_A3B_Instruct_LogprobDiff(CustomTestCase): + + def test_lora_qwen3_vl_30b_a3b_instruct_logprob_accuracy(self): + adapter_path = snapshot_download( + LORA_HF_REPO, + repo_type="dataset", + ) + + engine = sgl.Engine( + model_path=BASE_MODEL, + tp_size=TP_SIZE, + enable_lora=True, + max_lora_rank=MAX_LORA_RANK, + lora_paths={"my_lora": adapter_path}, + lora_backend=LORA_BACKEND, + attention_backend="flashinfer", + disable_cuda_graph=DISABLE_CUDA_GRAPH, + 
moe_runner_backend=MOE_RUNNER_BACKEND, + experts_shared_outer_loras=EXPERTS_SHARED_OUTER_LORAS, + prefill_attention_backend=PREFILL_ATTENTION_BACKEND, + decode_attention_backend=DECODE_ATTENTION_BACKEND, + ) + + try: + cdata = torch.load( + os.path.join(adapter_path, "compare_sample_train_data.pt"), + weights_only=False, + ) + + base_logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path=None) + logprobs = get_prompt_logprobs(engine, cdata["tokens"], lora_path="my_lora") + + base_t = torch.tensor(base_logprobs) + lora_t = torch.tensor(logprobs) + diff = (base_t - lora_t).abs() + print( + f"[VERIFY] base vs lora: mean_diff={diff.mean().item():.6f}, " + f"max_diff={diff.max().item():.6f}, " + f"identical={torch.equal(base_t, lora_t)}" + ) + + self.assertFalse( + torch.equal(base_t, lora_t), + "LoRA logprobs should differ from base model logprobs", + ) + + kl_sglang_trainer = kl_v2(cdata["training_logprobs"], logprobs) + kl_orig_trainer = kl_v2( + cdata["training_logprobs"], cdata["sampling_logprobs"] + ) + kl_sglang_orig = kl_v2(logprobs, cdata["sampling_logprobs"]) + + print(f"KL(orig_sampler, trainer) = {kl_orig_trainer:.6e}") + print(f"KL(sglang, trainer) = {kl_sglang_trainer:.6e}") + print(f"KL(sglang, orig_sampler) = {kl_sglang_orig:.6e}") + + self.assertLessEqual( + kl_sglang_trainer, + KL_THRESHOLD, + f"KL(sglang, trainer) = {kl_sglang_trainer:.6e} exceeds " + f"threshold {KL_THRESHOLD}", + ) + + finally: + engine.shutdown() + + +if __name__ == "__main__": + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + + try: + unittest.main(warnings="ignore", verbosity=2) + finally: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() From 6f29b1a25aa8e2c1613ec4a2b70a9b53bc2d03c9 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Tue, 31 Mar 2026 21:59:57 -0700 Subject: [PATCH 7/7] upd --- test/manual/lora/test_lora_qwen3_vl.py | 237 ------------------ ..._qwen3_vl_30b_a3b_instruct_logprob_diff.py | 4 +- 2 
files changed, 2 insertions(+), 239 deletions(-) delete mode 100644 test/manual/lora/test_lora_qwen3_vl.py diff --git a/test/manual/lora/test_lora_qwen3_vl.py b/test/manual/lora/test_lora_qwen3_vl.py deleted file mode 100644 index 728fa8bc9959..000000000000 --- a/test/manual/lora/test_lora_qwen3_vl.py +++ /dev/null @@ -1,237 +0,0 @@ -import random -import unittest -from typing import Sequence - -from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration -from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration -from sglang.test.lora_utils import ( - TORCH_DTYPES, - LoRAAdaptor, - LoRAModelCase, - ensure_reproducibility, -) -from sglang.test.runners import HFRunner, SRTRunner -from sglang.test.test_utils import CustomTestCase, calculate_rouge_l - - -class TestLoRAQwen3VLGating(CustomTestCase): - """Unit tests for should_apply_lora gating on Qwen3‑VL dense and MoE variants.""" - - def _assert_pattern( - self, pattern, positives: Sequence[str], negatives: Sequence[str] - ): - for name in positives: - self.assertTrue(bool(pattern.match(name)), f"Expected to match: {name}") - for name in negatives: - self.assertFalse(bool(pattern.match(name)), f"Should not match: {name}") - - def test_qwen3_vl_should_apply_lora_regex(self): - positives = ( - "model.layers.0.self_attn.qkv_proj", - "model.layers.1.self_attn.o_proj", - "model.layers.2.mlp.gate_up_proj", - "model.layers.3.mlp.down_proj", - ) - negatives = ( - "visual.blocks.0.attn.qkv_proj", - "model.layers.x.self_attn.qkv_proj", - "model.layers.0.attn.qkv_proj", - "model.layers.0.mlp.not_proj", - "model.layers.0.self_attn.q_proj", - ) - self._assert_pattern( - Qwen3VLForConditionalGeneration._lora_pattern, positives, negatives - ) - - def test_qwen3_vl_moe_should_apply_lora_regex(self): - positives = ( - "model.layers.0.self_attn.qkv_proj", - "model.layers.5.self_attn.o_proj", - "model.layers.0.mlp.experts", - "lm_head", - "model.embed_tokens", - ) - negatives = ( - 
"model.layers.0.mlp.gate_up_proj", - "model.layers.0.mlp.down_proj", - "visual.blocks.0.attn.qkv_proj", - "visual.blocks.0.mlp.experts", - "model.layers.x.self_attn.qkv_proj", - "model.layers.0.attn.qkv_proj", - ) - self._assert_pattern( - Qwen3VLMoeForConditionalGeneration._lora_pattern_moe, positives, negatives - ) - - -TEST_MULTIPLE_BATCH_PROMPTS = [ - """ - ### Instruction: - Tell me about llamas and alpacas - ### Response: - Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing. - ### Question 2: - What do you know about llamas? - ### Answer: - """, - """ - ### Instruction: - Write a poem about the transformers Python library. - Mention the word "large language models" in that poem. - ### Response: - The Transformers are large language models, - They're used to make predictions on text. 
- """, - "AI is a field of computer science focused on", - "Computer science is the study of", - "Write a short story.", - "What are the main components of a computer?", -] - - -LORA_MODEL_VARIANTS = [ - ( - "Qwen3-VL", - LoRAModelCase( - base="Qwen/Qwen3-VL-4B-Instruct", - adaptors=[ - LoRAAdaptor( - name="mryufei/Qwen3-VL-4B-Instruct-trl-sft", - prefill_tolerance=3e-1, - ), - ], - max_loras_per_batch=1, - ), - ), - # TODO: Move 30B MoE to 2 GPU runner - # ( - # "Qwen3-VL-MoE", - # LoRAModelCase( - # base="Qwen/Qwen3-VL-30B-A3B-Instruct", - # adaptors=[ - # LoRAAdaptor( - # name="sosoai/qwen3_vl_30b_lora", - # prefill_tolerance=3e-1, - # ), - # ], - # max_loras_per_batch=1, - # ), - # ), -] - -LORA_MAX_NEW_TOKENS = 32 - - -def _run_lora_multiple_batch_on_model_cases( - model_cases: Sequence[LoRAModelCase], *, max_new_tokens: int, variant_label: str -): - for model_case in model_cases: - for torch_dtype in TORCH_DTYPES: - backend = "csgmv" - base_path = model_case.base - lora_adapter_paths = [adaptor.name for adaptor in model_case.adaptors] - - batches = [ - ( - [ - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - ], - [None, lora_adapter_paths[0], None], - ), - ( - [ - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - ], - [lora_adapter_paths[0], None, None], - ), - ( - [ - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - random.choice(TEST_MULTIPLE_BATCH_PROMPTS), - ], - [None, None, None], - ), - ] - - print( - f"\n=== {variant_label} LoRA parity on '{base_path}', backend={backend}, dtype={torch_dtype} ===" - ) - - ensure_reproducibility() - srt_runner = SRTRunner( - base_path, - torch_dtype=torch_dtype, - model_type="generation", - lora_paths=lora_adapter_paths, - max_loras_per_batch=model_case.max_loras_per_batch, - lora_backend=backend, - 
sleep_on_idle=True, - attention_backend="torch_native", - disable_radix_cache=True, - ) - - ensure_reproducibility() - hf_runner = HFRunner( - base_path, - torch_dtype=torch_dtype, - model_type="generation", - patch_model_do_sample_false=True, - ) - - with srt_runner, hf_runner: - for i, (prompts, lora_paths) in enumerate(batches): - print( - f"\n--- Running Batch {i + 1} --- prompts: {prompts}, lora_paths: {lora_paths}" - ) - - srt_outputs = srt_runner.batch_forward( - prompts, - max_new_tokens=max_new_tokens, - lora_paths=lora_paths, - ) - - hf_outputs = hf_runner.forward( - prompts, - max_new_tokens=max_new_tokens, - lora_paths=lora_paths, - ) - - print("SRT outputs:", [s for s in srt_outputs.output_strs]) - print("HF outputs:", [s for s in hf_outputs.output_strs]) - - for srt_out, hf_out in zip( - srt_outputs.output_strs, hf_outputs.output_strs - ): - srt_str = srt_out.strip() - hf_str = hf_out.strip() - rouge_tol = model_case.rouge_l_tolerance - rouge_score = calculate_rouge_l([srt_str], [hf_str])[0] - if rouge_score < rouge_tol: - raise AssertionError( - f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} " - f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'" - ) - - print(f"--- Batch {i + 1} Comparison Passed --- ") - - -class TestLoRAQwen3VLIntegration(CustomTestCase): - """Parity integration tests for Qwen3‑VL dense and MoE LoRA adapters.""" - - def test_ci_lora_models(self): - for label, model_case in LORA_MODEL_VARIANTS: - with self.subTest(variant=label): - _run_lora_multiple_batch_on_model_cases( - [model_case], - max_new_tokens=LORA_MAX_NEW_TOKENS, - variant_label=label, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py b/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py index cd71aa1afd50..67031048fa14 100644 --- a/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py +++ 
b/test/registered/lora/test_lora_qwen3_vl_30b_a3b_instruct_logprob_diff.py @@ -36,14 +36,14 @@ register_cuda_ci( est_time=300, - suite="stage-c-test-8-gpu-h200", + suite="stage-c-test-4-gpu-b200", ) BASE_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct" LORA_HF_REPO = "yushengsu/lora-diff-Qwen3-VL-30B-A3B-Instruct" LORA_BACKEND = "triton" MAX_LORA_RANK = 32 -TP_SIZE = 8 +TP_SIZE = 4 DISABLE_CUDA_GRAPH = True MOE_RUNNER_BACKEND = "triton" EXPERTS_SHARED_OUTER_LORAS = True