From 66f1e084c31e09e5225783b3e18659ca5deebaf6 Mon Sep 17 00:00:00 2001
From: Shawn Yuxuan Tong
Date: Wed, 10 Jan 2024 10:10:44 +0800
Subject: [PATCH] Fix vocab_size inconsistency for sampler

---
 vllm/engine/llm_engine.py                 | 14 ++++++++------
 vllm/model_executor/models/aquila.py      |  5 ++++-
 vllm/model_executor/models/baichuan.py    |  5 ++++-
 vllm/model_executor/models/bloom.py       |  5 ++++-
 vllm/model_executor/models/chatglm.py     |  5 ++++-
 vllm/model_executor/models/falcon.py      |  5 ++++-
 vllm/model_executor/models/gpt2.py        |  5 ++++-
 vllm/model_executor/models/gpt_bigcode.py |  5 ++++-
 vllm/model_executor/models/gpt_j.py       |  5 ++++-
 vllm/model_executor/models/gpt_neox.py    |  5 ++++-
 vllm/model_executor/models/internlm.py    |  5 ++++-
 vllm/model_executor/models/llama.py       |  5 ++++-
 vllm/model_executor/models/mistral.py     |  5 ++++-
 vllm/model_executor/models/mixtral.py     |  5 ++++-
 vllm/model_executor/models/mpt.py         |  5 ++++-
 vllm/model_executor/models/opt.py         |  5 ++++-
 vllm/model_executor/models/phi_1_5.py     |  5 ++++-
 vllm/model_executor/models/qwen.py        |  5 ++++-
 vllm/model_executor/models/yi.py          |  5 ++++-
 19 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1920946a31d7..929bfbeca038 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -86,18 +86,20 @@ def __init__(
         # TODO(woosuk): Print more configs in debug mode.
 
         self.model_config = model_config
-        self.cache_config = cache_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.log_stats = log_stats
-        self._verify_args()
-
         self.tokenizer = get_tokenizer(
             model_config.tokenizer,
             tokenizer_mode=model_config.tokenizer_mode,
             trust_remote_code=model_config.trust_remote_code,
             tokenizer_revision=model_config.tokenizer_revision,
             revision=model_config.revision)
+        model_config.hf_config.sampler_vocab_size = min(
+            len(self.tokenizer), model_config.hf_config.vocab_size)
+        self.cache_config = cache_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.log_stats = log_stats
+        self._verify_args()
+
        self.seq_counter = Counter()
 
         # Create the parallel GPU workers.
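The engine-side hunk above is the heart of the fix: it computes how many token IDs the tokenizer can actually decode and stashes that on the HF config as sampler_vocab_size, so every model class can read it from the config object it already receives. The bug being fixed is that many checkpoints pad the embedding/LM-head vocabulary (config.vocab_size) beyond the tokenizer's real vocabulary, so a sampler sized to the padded value can emit token IDs the tokenizer cannot decode. A minimal sketch of the failure mode and the min() fix, using toy numbers rather than vLLM's actual classes:

    import torch

    tokenizer_vocab = 32000  # len(tokenizer): IDs the tokenizer can decode
    padded_vocab = 32256     # config.vocab_size: padded in the checkpoint

    logits = torch.randn(padded_vocab)

    # Unpatched behavior: the argmax may land in the padding region
    # [32000, 32256), yielding an ID the tokenizer has never seen.
    unsafe_id = int(torch.argmax(logits))

    # Patched behavior: restrict sampling to the decodable prefix.
    sampler_vocab = min(tokenizer_vocab, padded_vocab)
    safe_id = int(torch.argmax(logits[:sampler_vocab]))
    assert safe_id < tokenizer_vocab

Stashing the value on hf_config, rather than adding a new field to ModelConfig, is what lets the per-model changes below stay this small.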
diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py
index 2f2bd5ffb4a6..d95aa5d48130 100644
--- a/vllm/model_executor/models/aquila.py
+++ b/vllm/model_executor/models/aquila.py
@@ -281,7 +281,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = AquilaModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index f08c3c8d257f..84408e66833b 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -296,7 +296,10 @@ def __init__(self,
         self.linear_method = linear_method
         self.model = BaiChuanModel(config, position_embedding, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 4adfb6b78102..abe6a13bb94b 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -273,7 +273,10 @@ def __init__(
         self.linear_method = linear_method
         self.transformer = BloomModel(config, linear_method)
         self.lm_head_weight = self.transformer.word_embeddings.weight
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index dca8d724f976..91b9a5fd878b 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -332,7 +332,10 @@ def __init__(
         self.linear_method = linear_method
         self.transformer = ChatGLMModel(config, linear_method)
         self.lm_head_weight = self.transformer.output_layer.weight
-        self.sampler = Sampler(config.padded_vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.padded_vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 2b5e022312e3..d9aec9432f2a 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -373,7 +373,10 @@ def __init__(
             config.vocab_size,
             config.hidden_size,
         )
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 661da0fe0434..66ca83d27c28 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -218,7 +218,10 @@ def __init__(
         self.linear_method = linear_method
         self.transformer = GPT2Model(config, linear_method)
         self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index ef4c1d4143c8..567ca0eb86f5 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -237,7 +237,10 @@ def __init__(
         self.linear_method = linear_method
         self.transformer = GPTBigCodeModel(config, linear_method)
         self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 5bab30d9d442..6d1366f05033 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -223,7 +223,10 @@ def __init__(
             config.n_embd,
             bias=True,
         )
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 8f7e1063e0c1..e906ce95f5f8 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -238,7 +238,10 @@ def __init__(
             config.vocab_size,
             config.hidden_size,
         )
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py
index 5d0b93793c89..19bfb20b88cd 100644
--- a/vllm/model_executor/models/internlm.py
+++ b/vllm/model_executor/models/internlm.py
@@ -238,7 +238,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = InternLMModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 3791aa893893..87c12dcf186b 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -274,7 +274,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = LlamaModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py
index 70d033fec69f..819cf9d38691 100644
--- a/vllm/model_executor/models/mistral.py
+++ b/vllm/model_executor/models/mistral.py
@@ -270,7 +270,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = MistralModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index a8dadce24aa1..b4c37cf7f4a1 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -344,7 +344,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = MixtralModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 22a876e2ef69..0234abe1e283 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -259,7 +259,10 @@ def __init__(
 
         self.transformer = MPTModel(config, linear_method)
         self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 393b2dcabcd5..16f7c47d9b60 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -292,7 +292,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = OPTModel(config, linear_method)
         self.lm_head_weight = self.model.decoder.embed_tokens.weight
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/phi_1_5.py b/vllm/model_executor/models/phi_1_5.py
index 9d4424dd0890..19e53d8b286d 100644
--- a/vllm/model_executor/models/phi_1_5.py
+++ b/vllm/model_executor/models/phi_1_5.py
@@ -262,7 +262,10 @@ def __init__(self,
 
         self.transformer = PhiModel(config, linear_method)
         self.lm_head = PhiCausalLMHead(config)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index fbc7320fb45a..fb065553aed3 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -230,7 +230,10 @@ def __init__(
         self.linear_method = linear_method
         self.transformer = QWenModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py
index 53daa6c4cd93..546a403e4409 100644
--- a/vllm/model_executor/models/yi.py
+++ b/vllm/model_executor/models/yi.py
@@ -269,7 +269,10 @@ def __init__(
         self.linear_method = linear_method
         self.model = YiModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        if hasattr(config, "sampler_vocab_size"):
+            self.sampler = Sampler(config.sampler_vocab_size)
+        else:
+            self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
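All eighteen model classes now repeat the same four-line hasattr guard. A possible follow-up, sketched here as a hypothetical helper (make_sampler is not part of this patch or of vLLM), would centralize the fallback in one place:

    from transformers import PretrainedConfig

    from vllm.model_executor.layers.sampler import Sampler


    def make_sampler(config: PretrainedConfig) -> Sampler:
        # Hypothetical helper: prefer the engine-injected sampler_vocab_size,
        # falling back to the checkpoint's (possibly padded) vocab_size.
        return Sampler(getattr(config, "sampler_vocab_size",
                               config.vocab_size))

Each __init__ above would then reduce to self.sampler = make_sampler(config); ChatGLM, whose fallback is config.padded_vocab_size rather than config.vocab_size, would keep its one-line special case.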