From 4eda12faeae1d3d2a45e40d4a8783328cf66d2ad Mon Sep 17 00:00:00 2001
From: ChenYou
Date: Tue, 18 Nov 2025 11:07:26 +0800
Subject: [PATCH] Revert "[feat][dontmerge] add qknorm rope fused"

---
 python/sglang/srt/layers/rotary_embedding.py |  2 -
 python/sglang/srt/models/qwen3_moe.py        | 56 +++++++-------
 2 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index d34c782db4c0..02ae5e9ca076 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -2809,7 +2809,6 @@ def get_rope_wrapper(
     dtype: Optional[torch.dtype] = None,
     partial_rotary_factor: float = 1.0,
     device: Optional[str] = None,
-    dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
 ):
     if device != "cpu":
         wrapper = aiter_get_rope if _use_aiter else get_rope
@@ -2822,7 +2821,6 @@ def get_rope_wrapper(
             rope_scaling,
             dtype,
             partial_rotary_factor,
-            dual_chunk_attention_config,
         )
 
     return get_rope_cpu(
diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py
index 29f93eb18bc9..d3acc629ba9f 100644
--- a/python/sglang/srt/models/qwen3_moe.py
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -51,7 +51,7 @@
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.rotary_embedding import MRotaryEmbedding, get_rope_wrapper
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding, get_rope
 from sglang.srt.layers.utils import get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
@@ -66,9 +66,7 @@
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     add_prefix,
-    get_bool_env_var,
     is_cuda,
-    is_hip,
     is_flashinfer_available,
     is_non_idle_and_non_empty,
 )
@@ -79,8 +77,7 @@
 logger = logging.getLogger(__name__)
 
 _is_cuda = is_cuda()
-_is_hip = is_hip()
-_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
 
 class Qwen3MoeSparseMoeBlock(nn.Module):
     def __init__(
@@ -333,10 +330,8 @@ def __init__(
             reduce_results=False,
             prefix=add_prefix("o_proj", prefix),
         )
-        if _use_aiter and rope_scaling is not None:
-            rope_scaling["try_aiter_rope_fused_qknorm"] = True
 
-        self.rotary_emb = get_rope_wrapper(
+        self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
@@ -405,35 +400,22 @@ def forward_prepare(
             return hidden_states, forward_batch, None
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-
-        if _use_aiter:
-            assert self.k_norm.variance_epsilon == self.q_norm.variance_epsilon
-            self.rotary_emb(
-                qkv,
-                self.k_norm.weight,
-                self.q_norm.weight,
-                positions,
-                self.num_heads,
-                self.num_kv_heads,
-                self.k_norm.variance_epsilon,
-            )
-        else:
-            q, k = self._apply_qk_norm(q, k)
-            q, k = self.rotary_emb(
-                positions,
-                q,
-                k,
-                fused_set_kv_buffer_arg=(
-                    create_fused_set_kv_buffer_arg(
-                        value=v,
-                        layer=self.attn,
-                        forward_batch=forward_batch,
-                    )
-                    if enable_fused_set_kv_buffer(forward_batch)
-                    and self.compatible_with_fused_kv_buffer
-                    else None
-                ),
-            )
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(
+            positions,
+            q,
+            k,
+            fused_set_kv_buffer_arg=(
+                create_fused_set_kv_buffer_arg(
+                    value=v,
+                    layer=self.attn,
+                    forward_batch=forward_batch,
+                )
+                if enable_fused_set_kv_buffer(forward_batch)
+                and self.compatible_with_fused_kv_buffer
+                else None
+            ),
+        )
         inner_state = q, k, v, forward_batch
         return None, forward_batch, inner_state
 