diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index eaa65c544b8..8f5a71dff8c 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -5,7 +5,6 @@
 from torch.nn import Module
 from vllm import _custom_ops as ops
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod

 from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
@@ -25,6 +24,7 @@
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
 from sglang.srt.utils import is_hip, set_weight_attrs

 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py
index 839d10222e2..c8cbe36602b 100644
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -19,18 +19,11 @@
 # https://github.com/vllm-project/vllm/blob/4abf6336ec65c270343eb895e7b18786e9274176/vllm/lora/layers.py

-import json
-import os
 import re
-from typing import Any, Dict, List, Optional, Tuple

-import safetensors.torch
 import torch
 from torch import nn
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding

 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -38,7 +31,6 @@
     QKVParallelLinear,
     RowParallelLinear,
 )
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.model_loader.loader import DefaultModelLoader
diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py
index c973e64c7a0..d8916abacfb 100644
--- a/python/sglang/srt/models/baichuan.py
+++ b/python/sglang/srt/models/baichuan.py
@@ -24,11 +24,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope

 from sglang.srt.distributed import (
@@ -37,6 +32,11 @@
 )
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py
index a99232dc201..280ff152a0c 100644
--- a/python/sglang/srt/models/gpt2.py
+++ b/python/sglang/srt/models/gpt2.py
@@ -22,10 +22,9 @@
 import torch
 from torch import nn
 from transformers import GPT2Config
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding

 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn

 # from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
diff --git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py
index f5e722a14c6..2d15af43ff2 100644
--- a/python/sglang/srt/models/minicpm3.py
+++ b/python/sglang/srt/models/minicpm3.py
@@ -19,17 +19,17 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    MergedColumnParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope

 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
diff --git a/python/sglang/srt/models/olmo2.py b/python/sglang/srt/models/olmo2.py
index df0121930f3..fafe39d7189 100755
--- a/python/sglang/srt/models/olmo2.py
+++ b/python/sglang/srt/models/olmo2.py
@@ -22,7 +22,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
@@ -45,6 +44,7 @@
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import make_layers
diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py
index 74bc98372e7..9abe9ff25d9 100644
--- a/python/sglang/srt/models/olmoe.py
+++ b/python/sglang/srt/models/olmoe.py
@@ -23,12 +23,6 @@
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope

 from sglang.srt.distributed import (
@@ -37,6 +31,11 @@
 )
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py
index fc5dd49ea61..83912e894e2 100644
--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -22,6 +22,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+import logging
 from functools import lru_cache, partial
 from typing import Iterable, List, Optional, Tuple, Type, TypedDict

@@ -30,7 +31,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU

 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
@@ -50,7 +50,7 @@
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Model

-logger = init_logger(__name__)
+logger = logging.getLogger(__name__)

 # === Vision Inputs === #
diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py
index 7d6158a9b89..799e513ae7e 100644
--- a/python/sglang/srt/models/xverse.py
+++ b/python/sglang/srt/models/xverse.py
@@ -21,16 +21,16 @@
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
+from vllm.model_executor.layers.rotary_embedding import get_rope
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py
index 27f763011d5..97b62815a87 100644
--- a/python/sglang/srt/models/xverse_moe.py
+++ b/python/sglang/srt/models/xverse_moe.py
@@ -18,14 +18,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope

 from sglang.srt.distributed import (
@@ -33,6 +25,14 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import fused_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig