diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8b818f67c3d2..7f76be70616c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -58,7 +58,7 @@ MRotaryEmbedding, XDRotaryEmbedding, ) -from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader +from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.reload import ( finalize_layerwise_reload, initialize_layerwise_reload, @@ -194,7 +194,6 @@ ) if TYPE_CHECKING: - from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.spec_decode.ngram_proposer import NgramProposer @@ -4504,16 +4503,6 @@ def reload_weights( weights_not_loaded, ) - def save_tensorized_model( - self, - tensorizer_config: "TensorizerConfig", - ) -> None: - TensorizerLoader.save_model( - self.get_model(), - tensorizer_config=tensorizer_config, - model_config=self.model_config, - ) - def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 62f0433eff61..c0654abd53a2 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -57,6 +57,7 @@ from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.workspace import init_workspace_manager +from vllm.model_executor.model_loader import TensorizerLoader from .gpu.warmup import warmup_kernels from .utils import request_memory @@ -836,12 +837,11 @@ def save_sharded_state( max_size=max_size, ) - def save_tensorized_model( - self, - tensorizer_config: "TensorizerConfig", - ) -> None: - self.model_runner.save_tensorized_model( + def save_tensorized_model(self, tensorizer_config: "TensorizerConfig") -> None: + TensorizerLoader.save_model( + self.get_model(), + tensorizer_config=tensorizer_config, + model_config=self.model_config, ) def 
init_weight_transfer_engine(self, init_info: dict) -> None: