From e510c0d8ab35e218d612704c6386b2ed6f58e277 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:29:45 +0000 Subject: [PATCH 01/14] Add CustomOp Interface --- vllm/model_executor/ops/__init__.py | 0 vllm/model_executor/ops/custom_op.py | 33 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 vllm/model_executor/ops/__init__.py create mode 100644 vllm/model_executor/ops/custom_op.py diff --git a/vllm/model_executor/ops/__init__.py b/vllm/model_executor/ops/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/model_executor/ops/custom_op.py b/vllm/model_executor/ops/custom_op.py new file mode 100644 index 000000000000..d5b2e532b284 --- /dev/null +++ b/vllm/model_executor/ops/custom_op.py @@ -0,0 +1,33 @@ +from abc import abstractmethod + +import torch.nn as nn + +from vllm.utils import is_hip, is_cpu + + +class CustomOp(nn.Module): + + def forward(self, *args, **kwargs): + if not hasattr(self, "_forward_method"): + self._forward_method = self.dispatch_forward() + return self._forward_method(*args, **kwargs) + + @abstractmethod + def forward_cuda(self, *args, **kwargs): + raise NotImplementedError + + def forward_hip(self, *args, **kwargs): + # By default, we assume that HIP ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def forward_cpu(self, *args, **kwargs): + # By default, we assume that CPU ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def dispatch_forward(self): + if is_hip(): + return self.forward_hip + elif is_cpu(): + return self.forward_cpu + else: + return self.forward_cuda From d9d43a6e5ecc4e048b483b88ec733242457a54fe Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:30:02 +0000 Subject: [PATCH 02/14] Move activation --- .../{layers => ops}/activation.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) rename vllm/model_executor/{layers => ops}/activation.py (91%) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/ops/activation.py similarity index 91% rename from vllm/model_executor/layers/activation.py rename to vllm/model_executor/ops/activation.py index d101aa323b0e..10a5465690e4 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/ops/activation.py @@ -6,14 +6,14 @@ import torch.nn as nn import torch.nn.functional as F -from vllm import _custom_ops as ops from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.ops.custom_op import CustomOp from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs -class SiluAndMul(nn.Module): +class SiluAndMul(CustomOp): """An activation function for SwiGLU. The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. @@ -28,7 +28,9 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm._C import ops + d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) @@ -36,7 +38,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class GeluAndMul(nn.Module): +class GeluAndMul(CustomOp): """An activation function for GeGLU. The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. @@ -57,7 +59,9 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm._C import ops + d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) @@ -71,7 +75,7 @@ def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' -class NewGELU(nn.Module): +class NewGELU(CustomOp): def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" @@ -79,20 +83,24 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0)))) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm._C import ops + out = torch.empty_like(x) ops.gelu_new(out, x) return out -class FastGELU(nn.Module): +class FastGELU(CustomOp): def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + from vllm._C import ops + out = torch.empty_like(x) ops.gelu_fast(out, x) return out From 19bff1c215aa7b6cf9435b78d738c1f6486fa38e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:30:14 +0000 Subject: [PATCH 03/14] Move layernorm --- vllm/model_executor/{layers => ops}/layernorm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) rename vllm/model_executor/{layers => ops}/layernorm.py (93%) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/ops/layernorm.py similarity index 93% rename from vllm/model_executor/layers/layernorm.py rename to vllm/model_executor/ops/layernorm.py index 8de079415898..94f9da32cbf1 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/ops/layernorm.py @@ -4,10 +4,10 @@ import torch import torch.nn as nn -from vllm import _custom_ops as ops +from vllm.model_executor.ops.custom_op import CustomOp -class RMSNorm(nn.Module): +class RMSNorm(CustomOp): """Root mean square normalization. Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. @@ -43,11 +43,13 @@ def _forward( else: return x, residual - def forward( + def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._C import ops + if residual is not None: ops.fused_add_rms_norm( x, From 8bff05afc6dae6b34e3571cfff6fd08f504eb415 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:33:52 +0000 Subject: [PATCH 04/14] Move RoPE --- vllm/model_executor/{layers => ops}/rotary_embedding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) rename vllm/model_executor/{layers => ops}/rotary_embedding.py (99%) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/ops/rotary_embedding.py similarity index 99% rename from vllm/model_executor/layers/rotary_embedding.py rename to vllm/model_executor/ops/rotary_embedding.py index d03903d206d3..be7562d338b3 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/ops/rotary_embedding.py @@ -27,7 +27,7 @@ import torch import torch.nn as nn -from vllm import _custom_ops as ops +from vllm.model_executor.ops.custom_op import CustomOp def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -43,7 +43,7 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) -class RotaryEmbedding(nn.Module): +class RotaryEmbedding(CustomOp): """Original rotary positional embedding.""" def __init__( @@ -138,13 +138,15 @@ def _forward( key = key.flatten(-2) return query, key - def forward( + def forward_cuda( self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm._C import ops + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) # ops.rotary_embedding()/batched_rotary_embedding() From af0d31e33fa145f2b7b34a430a10dc605c1ab838 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:43:44 +0000 Subject: [PATCH 05/14] Minor --- benchmarks/kernels/benchmark_rope.py | 2 +- vllm/model_executor/ops/custom_op.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f6060b5..5efcfad31ac1 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -5,7 +5,7 @@ import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.ops.rotary_embedding import get_rope def benchmark_rope_kernels_multi_lora( diff --git a/vllm/model_executor/ops/custom_op.py b/vllm/model_executor/ops/custom_op.py index d5b2e532b284..6b88d728a815 100644 --- a/vllm/model_executor/ops/custom_op.py +++ b/vllm/model_executor/ops/custom_op.py @@ -2,7 +2,7 @@ import torch.nn as nn -from vllm.utils import is_hip, is_cpu +from vllm.utils import is_cpu, is_hip class CustomOp(nn.Module): From a631e7f75bf843cd59b8b5d67da6d7b7e2f4adb9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:44:07 +0000 Subject: [PATCH 06/14] Fix --- tests/kernels/test_activation.py | 4 ++-- tests/kernels/test_layernorm.py | 2 +- tests/kernels/test_moe.py | 2 +- tests/kernels/test_pos_encoding.py | 2 +- tests/lora/test_layers.py | 2 +- tests/lora/test_long_context.py | 2 +- tests/models/test_gptq_marlin.py | 2 +- vllm/lora/layers.py | 4 ++-- vllm/model_executor/models/arctic.py | 6 +++--- vllm/model_executor/models/baichuan.py | 6 +++--- vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/chatglm.py | 6 +++--- vllm/model_executor/models/commandr.py | 4 ++-- vllm/model_executor/models/dbrx.py | 2 +- vllm/model_executor/models/deepseek.py | 6 +++--- vllm/model_executor/models/falcon.py | 4 ++-- vllm/model_executor/models/gemma.py | 6 +++--- vllm/model_executor/models/gpt2.py | 2 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/gpt_j.py | 4 ++-- vllm/model_executor/models/gpt_neox.py | 4 ++-- vllm/model_executor/models/internlm2.py | 6 +++--- vllm/model_executor/models/llama.py | 6 +++--- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minicpm.py | 6 +++--- vllm/model_executor/models/mixtral.py | 4 ++-- vllm/model_executor/models/mixtral_quant.py | 4 ++-- vllm/model_executor/models/mpt.py | 2 +- vllm/model_executor/models/olmo.py | 4 ++-- vllm/model_executor/models/opt.py | 2 +- vllm/model_executor/models/orion.py | 4 ++-- vllm/model_executor/models/phi.py | 4 ++-- vllm/model_executor/models/phi3_small.py | 2 +- vllm/model_executor/models/qwen.py | 6 +++--- vllm/model_executor/models/qwen2.py | 6 +++--- vllm/model_executor/models/qwen2_moe.py | 6 +++--- vllm/model_executor/models/stablelm.py | 4 ++-- vllm/model_executor/models/starcoder2.py | 4 ++-- vllm/model_executor/models/xverse.py | 6 +++--- 39 files changed, 76 insertions(+), 76 deletions(-) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a624c4ca9ee6..d29b14d07365 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -3,8 +3,8 @@ import pytest import torch -from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, - NewGELU, SiluAndMul) +from vllm.model_executor.ops.activation import (FastGELU, GeluAndMul, NewGELU, + SiluAndMul) from .allclose_default import get_default_atol, get_default_rtol diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 210d59e4f32f..689ce46966be 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,7 +1,7 @@ import pytest import torch -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.ops.layernorm import RMSNorm DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2356b9ec18b0..5199931ebd88 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,9 +7,9 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.model_executor.ops.activation import SiluAndMul def torch_moe(a, w1, w2, score, topk): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index fbabc02bf9a9..7ed2567c4f16 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.ops.rotary_embedding import get_rope from .allclose_default import get_default_atol, get_default_rtol diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 9a2c8b04dac4..eca3c86891ca 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -30,9 +30,9 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 4361e5452cdf..0f5d2232a712 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -8,7 +8,7 @@ from vllm import SamplingParams from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.rotary_embedding import ( +from vllm.model_executor.ops.rotary_embedding import ( LinearScalingRotaryEmbedding) from .data.long_context_test_data import prompts_and_responses diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 814471b47763..0e85d1b8765c 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -14,7 +14,7 @@ import torch from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT +from vllm.model_executor.ops.rotary_embedding import _ROPE_DICT from .utils import check_logprobs_close diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 24b74476c3b8..41c164188857 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -22,10 +22,10 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.rotary_embedding import ( - LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.model_executor.ops.rotary_embedding import ( + LinearScalingRotaryEmbedding, RotaryEmbedding) if TYPE_CHECKING: pass diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 313762b1353d..c225504bc351 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -10,9 +10,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.logger import init_logger -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -22,11 +20,13 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index babb92e7cdce..c527fd78a84f 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -29,19 +29,19 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index a29aee4cffb7..b40885003c3b 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -27,7 +27,6 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -38,6 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index e3a5e43e23e1..2a3f4d94cf0c 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -11,19 +11,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 84786921ce1b..17532e78af79 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -32,18 +32,18 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 8ff19a2015e0..5a5e987272b7 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -16,11 +16,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 8fbda2638aaa..440717607ac1 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -32,9 +32,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -42,11 +40,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 9618652f70d2..6aeb01e9df27 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -31,18 +31,18 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 27dda00b66af..cbc23f8599eb 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -25,19 +25,19 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger -from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import GeluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index cc83f6eb6d94..a6fc3d83774c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -26,7 +26,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -37,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 69b75763e9a3..926014545abf 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -27,7 +27,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -38,6 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 47fd5788a4c3..2d1180289b89 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -25,18 +25,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index eb0fcc8f26a5..158b3daefc6c 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -25,18 +25,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index e75c567f589c..d7a63c4490e3 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -8,19 +8,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d83ee9a201c0..c39688f19f6a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,20 +31,20 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader) +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.utils import is_hip, print_warning_once diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 3332bcc57846..35b6a87cdadd 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,7 +8,6 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -16,6 +15,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 59fbf8e1b35f..b58501779b3d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -32,9 +32,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -42,11 +40,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2f4237339486..2d7328c28808 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -34,7 +34,6 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -42,11 +41,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.fp8 import Fp8Config -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 1894c05e167d..51117f32b24e 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -34,18 +34,18 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 5f9e4d86f3cd..12db36fc6c78 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -10,7 +10,6 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -21,6 +20,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 39270f71ec46..d930eb6ba4dc 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -30,18 +30,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 4bf59105dbab..50219ea74761 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -26,7 +26,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -38,6 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 133a10e6bb3e..46be7b8b515d 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -13,18 +13,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index c8e61735a9bb..5a5ea02594fe 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -44,18 +44,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 0c5298eb6f10..ec1e183f818d 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -15,11 +15,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d22ea6b79de0..8c5ae741910f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -13,19 +13,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9a4829a27873..4f29d0ae2891 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -31,19 +31,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 564536f2dd24..3243c947693d 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,9 +34,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -44,11 +42,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index a6ed3800bed0..e2cfb69ddf5f 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -28,18 +28,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 4324bf50d4ad..6e90767c5d31 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -27,18 +27,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import get_act_fn +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 1e5280dde3ff..d4f7b70946d7 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -29,19 +29,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.ops.activation import SiluAndMul +from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput From a1486ffd5d54196682ca1dc7348ec84b55b47ab9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:51:53 +0000 Subject: [PATCH 07/14] Fix --- vllm/model_executor/ops/activation.py | 8 ++++---- vllm/model_executor/ops/layernorm.py | 2 +- vllm/model_executor/ops/rotary_embedding.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/ops/activation.py b/vllm/model_executor/ops/activation.py index 10a5465690e4..ec407526e5bb 100644 --- a/vllm/model_executor/ops/activation.py +++ b/vllm/model_executor/ops/activation.py @@ -29,7 +29,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: return F.silu(x[..., :d]) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._C import ops + from vllm._custom_ops import ops d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -60,7 +60,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._C import ops + from vllm._custom_ops import ops d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -84,7 +84,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: (x + 0.044715 * torch.pow(x, 3.0)))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._C import ops + from vllm._custom_ops import ops out = torch.empty_like(x) ops.gelu_new(out, x) @@ -99,7 +99,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: (1.0 + 0.044715 * x * x))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._C import ops + from vllm._custom_ops import ops out = torch.empty_like(x) ops.gelu_fast(out, x) diff --git a/vllm/model_executor/ops/layernorm.py b/vllm/model_executor/ops/layernorm.py index 94f9da32cbf1..2254de217e37 100644 --- a/vllm/model_executor/ops/layernorm.py +++ b/vllm/model_executor/ops/layernorm.py @@ -48,7 +48,7 @@ def forward_cuda( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm._C import ops + from vllm._custom_ops import ops if residual is not None: ops.fused_add_rms_norm( diff --git a/vllm/model_executor/ops/rotary_embedding.py b/vllm/model_executor/ops/rotary_embedding.py index be7562d338b3..4a0f2b5fdd7f 100644 --- a/vllm/model_executor/ops/rotary_embedding.py +++ b/vllm/model_executor/ops/rotary_embedding.py @@ -145,7 +145,7 @@ def forward_cuda( key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - from vllm._C import ops + from vllm._custom_ops import ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) From e135eaed1a404273016e398cb3b9b0e6a656cb14 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 16:59:41 +0000 Subject: [PATCH 08/14] Fix --- vllm/model_executor/ops/activation.py | 8 ++++---- vllm/model_executor/ops/layernorm.py | 2 +- vllm/model_executor/ops/rotary_embedding.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/ops/activation.py b/vllm/model_executor/ops/activation.py index ec407526e5bb..71be87c45ba1 100644 --- a/vllm/model_executor/ops/activation.py +++ b/vllm/model_executor/ops/activation.py @@ -29,7 +29,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: return F.silu(x[..., :d]) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -60,7 +60,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -84,7 +84,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: (x + 0.044715 * torch.pow(x, 3.0)))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops out = torch.empty_like(x) ops.gelu_new(out, x) @@ -99,7 +99,7 @@ def _forward(self, x: torch.Tensor) -> torch.Tensor: (1.0 + 0.044715 * x * x))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops out = torch.empty_like(x) ops.gelu_fast(out, x) diff --git a/vllm/model_executor/ops/layernorm.py b/vllm/model_executor/ops/layernorm.py index 2254de217e37..5b8f665cc320 100644 --- a/vllm/model_executor/ops/layernorm.py +++ b/vllm/model_executor/ops/layernorm.py @@ -48,7 +48,7 @@ def forward_cuda( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops if residual is not None: ops.fused_add_rms_norm( diff --git a/vllm/model_executor/ops/rotary_embedding.py b/vllm/model_executor/ops/rotary_embedding.py index 4a0f2b5fdd7f..3cacf307cf1d 100644 --- a/vllm/model_executor/ops/rotary_embedding.py +++ b/vllm/model_executor/ops/rotary_embedding.py @@ -145,7 +145,7 @@ def forward_cuda( key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - from vllm._custom_ops import ops + from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) From 16bab8e43e4dbdb6806db91b5e154601a4d8c299 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 17:40:35 +0000 Subject: [PATCH 09/14] Revert model changes --- vllm/model_executor/models/arctic.py | 6 +++--- vllm/model_executor/models/baichuan.py | 6 +++--- vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/chatglm.py | 6 +++--- vllm/model_executor/models/commandr.py | 4 ++-- vllm/model_executor/models/dbrx.py | 2 +- vllm/model_executor/models/deepseek.py | 6 +++--- vllm/model_executor/models/falcon.py | 4 ++-- vllm/model_executor/models/gemma.py | 6 +++--- vllm/model_executor/models/gpt2.py | 2 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/gpt_j.py | 4 ++-- vllm/model_executor/models/gpt_neox.py | 4 ++-- vllm/model_executor/models/internlm2.py | 6 +++--- vllm/model_executor/models/llama.py | 6 +++--- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minicpm.py | 6 +++--- vllm/model_executor/models/mixtral.py | 4 ++-- vllm/model_executor/models/mixtral_quant.py | 4 ++-- vllm/model_executor/models/mpt.py | 2 +- vllm/model_executor/models/olmo.py | 4 ++-- vllm/model_executor/models/opt.py | 2 +- vllm/model_executor/models/orion.py | 4 ++-- vllm/model_executor/models/phi.py | 4 ++-- vllm/model_executor/models/phi3_small.py | 2 +- vllm/model_executor/models/qwen.py | 6 +++--- vllm/model_executor/models/qwen2.py | 6 +++--- vllm/model_executor/models/qwen2_moe.py | 6 +++--- vllm/model_executor/models/stablelm.py | 4 ++-- vllm/model_executor/models/starcoder2.py | 4 ++-- vllm/model_executor/models/xverse.py | 6 +++--- 31 files changed, 66 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index c225504bc351..313762b1353d 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -10,7 +10,9 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -20,13 +22,11 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index c527fd78a84f..babb92e7cdce 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -29,19 +29,19 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index b40885003c3b..a29aee4cffb7 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -27,6 +27,7 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -37,7 +38,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 2a3f4d94cf0c..e3a5e43e23e1 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -11,19 +11,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 17532e78af79..84786921ce1b 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -32,18 +32,18 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 5a5e987272b7..8ff19a2015e0 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -16,11 +16,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 440717607ac1..8fbda2638aaa 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -32,7 +32,9 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -40,13 +42,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 6aeb01e9df27..9618652f70d2 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -31,18 +31,18 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index cbc23f8599eb..27dda00b66af 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -25,19 +25,19 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import GeluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index a6fc3d83774c..cc83f6eb6d94 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -26,6 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -36,7 +37,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 926014545abf..69b75763e9a3 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -27,6 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -37,7 +38,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 2d1180289b89..47fd5788a4c3 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -25,18 +25,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 158b3daefc6c..eb0fcc8f26a5 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -25,18 +25,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d7a63c4490e3..e75c567f589c 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -8,19 +8,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c39688f19f6a..d83ee9a201c0 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,20 +31,20 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader) -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.utils import is_hip, print_warning_once diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 35b6a87cdadd..3332bcc57846 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,6 +8,7 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -15,7 +16,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index b58501779b3d..59fbf8e1b35f 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -32,7 +32,9 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -40,13 +42,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2d7328c28808..2f4237339486 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -34,6 +34,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -41,12 +42,11 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.fp8 import Fp8Config +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 51117f32b24e..1894c05e167d 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -34,18 +34,18 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 12db36fc6c78..5f9e4d86f3cd 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -10,6 +10,7 @@ from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -20,7 +21,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index d930eb6ba4dc..39270f71ec46 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -30,18 +30,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 50219ea74761..4bf59105dbab 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -26,6 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -37,7 +38,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 46be7b8b515d..133a10e6bb3e 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -13,18 +13,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 5a5ea02594fe..c8e61735a9bb 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -44,18 +44,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index ec1e183f818d..0c5298eb6f10 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -15,11 +15,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 8c5ae741910f..d22ea6b79de0 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -13,19 +13,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 4f29d0ae2891..9a4829a27873 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -31,19 +31,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 3243c947693d..564536f2dd24 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,7 +34,9 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -42,13 +44,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index e2cfb69ddf5f..a6ed3800bed0 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -28,18 +28,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 6e90767c5d31..4324bf50d4ad 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -27,18 +27,18 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import get_act_fn -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index d4f7b70946d7..1e5280dde3ff 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -29,19 +29,19 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.ops.activation import SiluAndMul -from vllm.model_executor.ops.layernorm import RMSNorm -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput From 41b9a2aa2fc344635c3cf0407f42965489b3b514 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 17:41:42 +0000 Subject: [PATCH 10/14] move back --- vllm/model_executor/{ops => layers}/activation.py | 0 vllm/model_executor/{ops => layers}/layernorm.py | 0 vllm/model_executor/{ops => layers}/rotary_embedding.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename vllm/model_executor/{ops => layers}/activation.py (100%) rename vllm/model_executor/{ops => layers}/layernorm.py (100%) rename vllm/model_executor/{ops => layers}/rotary_embedding.py (100%) diff --git a/vllm/model_executor/ops/activation.py b/vllm/model_executor/layers/activation.py similarity index 100% rename from vllm/model_executor/ops/activation.py rename to vllm/model_executor/layers/activation.py diff --git a/vllm/model_executor/ops/layernorm.py b/vllm/model_executor/layers/layernorm.py similarity index 100% rename from vllm/model_executor/ops/layernorm.py rename to vllm/model_executor/layers/layernorm.py diff --git a/vllm/model_executor/ops/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py similarity index 100% rename from vllm/model_executor/ops/rotary_embedding.py rename to vllm/model_executor/layers/rotary_embedding.py From 7986c0fcd088e5a84d917f621f1ee64a908c83a0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 17:45:06 +0000 Subject: [PATCH 11/14] forward_native --- tests/kernels/test_activation.py | 4 +- tests/kernels/test_layernorm.py | 2 +- tests/kernels/test_pos_encoding.py | 7 ++- vllm/model_executor/custom_op.py | 56 +++++++++++++++++++ vllm/model_executor/layers/activation.py | 10 ++-- vllm/model_executor/layers/layernorm.py | 4 +- .../model_executor/layers/rotary_embedding.py | 4 +- vllm/model_executor/ops/__init__.py | 0 vllm/model_executor/ops/custom_op.py | 33 ----------- 9 files changed, 72 insertions(+), 48 deletions(-) create mode 100644 vllm/model_executor/custom_op.py delete mode 100644 vllm/model_executor/ops/__init__.py delete mode 100644 vllm/model_executor/ops/custom_op.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index d29b14d07365..3284f9ec9166 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -44,7 +44,7 @@ def test_act_and_mul( elif activation == "gelu_tanh": layer = GeluAndMul(approximate="tanh") out = layer(x) - ref_out = layer._forward(x) + ref_out = layer.forward_native(x) # The SiLU and GELU implementations are equivalent to the native PyTorch # implementations, so we can do exact comparison. assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) @@ -72,7 +72,7 @@ def test_activation( x = torch.randn(num_tokens, d, dtype=dtype) layer = activation() out = layer(x) - ref_out = layer._forward(x) + ref_out = layer.forward_native(x) assert torch.allclose(out, ref_out, atol=get_default_atol(out), diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 689ce46966be..d95194dd07af 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -42,7 +42,7 @@ def test_rms_norm( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. - ref_out = layer._forward(x, residual) + ref_out = layer.forward_native(x, residual) out = layer(x, residual) # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger # numerical errors than other operators because they involve reductions. diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 7ed2567c4f16..a5fb0d4c4bd8 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -64,7 +64,7 @@ def test_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. - ref_query, ref_key = rope._forward(positions, query, key) + ref_query, ref_key = rope.forward_native(positions, query, key) out_query, out_key = rope.forward(positions, query, key) # Compare the results. assert torch.allclose(out_query, @@ -121,7 +121,7 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. - ref_query, ref_key = rope._forward(positions, query, key) + ref_query, ref_key = rope.forward_native(positions, query, key) out_query, out_key = rope.forward(positions, query, key, @@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. - ref_query, ref_key = rope._forward(positions, query, key, query_offsets) + ref_query, ref_key = rope.forward_native(positions, query, key, + query_offsets) out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py new file mode 100644 index 000000000000..4f683cf58d00 --- /dev/null +++ b/vllm/model_executor/custom_op.py @@ -0,0 +1,56 @@ +import torch.nn as nn + +from vllm.utils import is_cpu, is_hip + + +class CustomOp(nn.Module): + + def forward(self, *args, **kwargs): + if not hasattr(self, "_forward_method"): + self._forward_method = self.dispatch_forward() + return self._forward_method(*args, **kwargs) + + def forward_native(self, *args, **kwargs): + """PyTorch-native implementation of the forward method. + + This method is optional. If implemented, it can be used with compilers + such as torch.compile or PyTorch XLA. Also, it can be used for testing + purposes. + """ + raise NotImplementedError + + def forward_cuda(self, *args, **kwargs): + raise NotImplementedError + + def forward_hip(self, *args, **kwargs): + # By default, we assume that HIP ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def forward_xpu(self, *args, **kwargs): + # By default, we assume that XPU ops are compatible with CUDA ops. + # NOTE(woosuk): This is a placeholder for future extensions. + return self.forward_cuda(*args, **kwargs) + + def forward_cpu(self, *args, **kwargs): + # By default, we assume that CPU ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def forward_tpu(self, *args, **kwargs): + # By default, we assume that TPU ops are compatible with the + # PyTorch-native implementation. + # NOTE(woosuk): This is a placeholder for future extensions. + return self.forward_native(*args, **kwargs) + + def forward_gaudi(self, *args, **kwargs): + # By default, we assume that Gaudi ops are compatible with the + # PyTorch-native implementation. + # NOTE(woosuk): This is a placeholder for future extensions. + return self.forward_native(*args, **kwargs) + + def dispatch_forward(self): + if is_hip(): + return self.forward_hip + elif is_cpu(): + return self.forward_cpu + else: + return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 71be87c45ba1..4d076421f9d2 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -8,7 +8,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.ops.custom_op import CustomOp +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs @@ -23,7 +23,7 @@ class SiluAndMul(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ - def _forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] @@ -54,7 +54,7 @@ def __init__(self, approximate: str = "none"): if approximate not in ("none", "tanh"): raise ValueError(f"Unknown approximate mode: {approximate}") - def _forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] @@ -77,7 +77,7 @@ def extra_repr(self) -> str: class NewGELU(CustomOp): - def _forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" c = math.sqrt(2.0 / math.pi) return 0.5 * x * (1.0 + torch.tanh(c * @@ -93,7 +93,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: class FastGELU(CustomOp): - def _forward(self, x: torch.Tensor) -> torch.Tensor: + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 5b8f665cc320..4533adf8f83a 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from vllm.model_executor.ops.custom_op import CustomOp +from vllm.model_executor.custom_op import CustomOp class RMSNorm(CustomOp): @@ -23,7 +23,7 @@ def __init__( self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - def _forward( + def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3cacf307cf1d..d2652106b844 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,7 +27,7 @@ import torch import torch.nn as nn -from vllm.model_executor.ops.custom_op import CustomOp +from vllm.model_executor.custom_op import CustomOp def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -93,7 +93,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: cache = torch.cat((cos, sin), dim=-1) return cache - def _forward( + def forward_native( self, positions: torch.Tensor, query: torch.Tensor, diff --git a/vllm/model_executor/ops/__init__.py b/vllm/model_executor/ops/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/model_executor/ops/custom_op.py b/vllm/model_executor/ops/custom_op.py deleted file mode 100644 index 6b88d728a815..000000000000 --- a/vllm/model_executor/ops/custom_op.py +++ /dev/null @@ -1,33 +0,0 @@ -from abc import abstractmethod - -import torch.nn as nn - -from vllm.utils import is_cpu, is_hip - - -class CustomOp(nn.Module): - - def forward(self, *args, **kwargs): - if not hasattr(self, "_forward_method"): - self._forward_method = self.dispatch_forward() - return self._forward_method(*args, **kwargs) - - @abstractmethod - def forward_cuda(self, *args, **kwargs): - raise NotImplementedError - - def forward_hip(self, *args, **kwargs): - # By default, we assume that HIP ops are compatible with CUDA ops. - return self.forward_cuda(*args, **kwargs) - - def forward_cpu(self, *args, **kwargs): - # By default, we assume that CPU ops are compatible with CUDA ops. - return self.forward_cuda(*args, **kwargs) - - def dispatch_forward(self): - if is_hip(): - return self.forward_hip - elif is_cpu(): - return self.forward_cpu - else: - return self.forward_cuda From 24e11d223e1a91179c41178dd9cfebc8cb016909 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 17:47:02 +0000 Subject: [PATCH 12/14] revert --- benchmarks/kernels/benchmark_rope.py | 2 +- tests/kernels/test_activation.py | 4 ++-- tests/kernels/test_layernorm.py | 2 +- tests/kernels/test_moe.py | 2 +- tests/kernels/test_pos_encoding.py | 2 +- tests/lora/test_layers.py | 2 +- tests/lora/test_long_context.py | 2 +- tests/models/test_gptq_marlin.py | 2 +- vllm/lora/layers.py | 4 ++-- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 5efcfad31ac1..00e55f6060b5 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -5,7 +5,7 @@ import nvtx import torch -from vllm.model_executor.ops.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import get_rope def benchmark_rope_kernels_multi_lora( diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 3284f9ec9166..a4b9f91c7688 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -3,8 +3,8 @@ import pytest import torch -from vllm.model_executor.ops.activation import (FastGELU, GeluAndMul, NewGELU, - SiluAndMul) +from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, + NewGELU, SiluAndMul) from .allclose_default import get_default_atol, get_default_rtol diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index d95194dd07af..a635e6c12c59 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,7 +1,7 @@ import pytest import torch -from vllm.model_executor.ops.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import RMSNorm DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 5199931ebd88..2356b9ec18b0 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,9 +7,9 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE -from vllm.model_executor.ops.activation import SiluAndMul def torch_moe(a, w1, w2, score, topk): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index a5fb0d4c4bd8..e564e325112a 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.model_executor.ops.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import get_rope from .allclose_default import get_default_atol, get_default_rtol diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index eca3c86891ca..9a2c8b04dac4 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -30,9 +30,9 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.ops.rotary_embedding import get_rope from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 0f5d2232a712..4361e5452cdf 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -8,7 +8,7 @@ from vllm import SamplingParams from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora from vllm.lora.request import LoRARequest -from vllm.model_executor.ops.rotary_embedding import ( +from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding) from .data.long_context_test_data import prompts_and_responses diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 0e85d1b8765c..814471b47763 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -14,7 +14,7 @@ import torch from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.model_executor.ops.rotary_embedding import _ROPE_DICT +from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from .utils import check_logprobs_close diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 41c164188857..24b74476c3b8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -22,10 +22,10 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import ( + LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.ops.rotary_embedding import ( - LinearScalingRotaryEmbedding, RotaryEmbedding) if TYPE_CHECKING: pass From cdc62a2b003d9b44f152c9310c8743fd8b61b391 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 18:58:12 +0000 Subject: [PATCH 13/14] Move dispatch to offline --- vllm/model_executor/custom_op.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 4f683cf58d00..dea819e5bd52 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -5,9 +5,11 @@ class CustomOp(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self._forward_method = self.dispatch_forward() + def forward(self, *args, **kwargs): - if not hasattr(self, "_forward_method"): - self._forward_method = self.dispatch_forward() return self._forward_method(*args, **kwargs) def forward_native(self, *args, **kwargs): From d1182e717fee4cd87cb19379eb67837296ffba22 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 4 Jun 2024 18:59:56 +0000 Subject: [PATCH 14/14] Add note --- vllm/model_executor/custom_op.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index dea819e5bd52..1d49213cd4ab 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -50,6 +50,8 @@ def forward_gaudi(self, *args, **kwargs): return self.forward_native(*args, **kwargs) def dispatch_forward(self): + # NOTE(woosuk): Here we assume that vLLM was built for only one + # specific backend. Currently, we do not support dynamic dispatching. if is_hip(): return self.forward_hip elif is_cpu():