From 730ba1dbbea33ef81737290f5b904d30a1308cf1 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Wed, 30 Apr 2025 12:08:06 +0200
Subject: [PATCH 1/2] Update vLLM to 0.8.5

Signed-off-by: Jan Lasek
---
 nemo/export/vllm/engine.py          | 2 +-
 nemo/export/vllm/model_config.py    | 1 +
 nemo/export/vllm/tokenizer_group.py | 4 ++--
 requirements/requirements_vllm.txt  | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/nemo/export/vllm/engine.py b/nemo/export/vllm/engine.py
index 4e2ae0a66d07..c3776b842b83 100644
--- a/nemo/export/vllm/engine.py
+++ b/nemo/export/vllm/engine.py
@@ -18,7 +18,7 @@
 from sentencepiece import SentencePieceProcessor
 from transformers import PreTrainedTokenizerBase
 from vllm import LLMEngine
-from vllm.transformers_utils.tokenizer_group.tokenizer_group import TokenizerGroup
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
 from nemo.export.tarutils import TarPath
diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py
index 9812b86884f8..4f64839b17b7 100644
--- a/nemo/export/vllm/model_config.py
+++ b/nemo/export/vllm/model_config.py
@@ -164,6 +164,7 @@ def __init__(
         )
         self.is_attention_free = self._init_attention_free()
         self.has_inner_state = self._init_has_inner_state()
+        self.has_noops = self._init_has_noops()
 
         self._verify_tokenizer_mode()
         self._verify_quantization()
diff --git a/nemo/export/vllm/tokenizer_group.py b/nemo/export/vllm/tokenizer_group.py
index 34d35af352c2..d99daebb417f 100644
--- a/nemo/export/vllm/tokenizer_group.py
+++ b/nemo/export/vllm/tokenizer_group.py
@@ -16,12 +16,12 @@
 
 from vllm.config import TokenizerPoolConfig
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import BaseTokenizerGroup
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
 
 
-class NemoTokenizerGroup(BaseTokenizerGroup):
+class NemoTokenizerGroup(TokenizerGroup):
     """
     Implements a custom tokenizer for vLLM, based on SentencePieceTokenizer.
     """
diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt
index 1e5ce908eaa5..7bb9fff06bff 100644
--- a/requirements/requirements_vllm.txt
+++ b/requirements/requirements_vllm.txt
@@ -19,7 +19,7 @@ pangu
 rouge_score
 sacrebleu
 scikit-learn
-vllm==0.8.2
+vllm==0.8.5
 webdataset>=0.2.86
 wget
 zarr>=2.18.2,<3.0.0

From dcaf3a09cade9094ae5f19495aa3c0265e29fc49 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Wed, 30 Apr 2025 12:10:12 +0200
Subject: [PATCH 2/2] Extend Llama converter to shared embeddings option

Signed-off-by: Jan Lasek
---
 nemo/export/vllm/model_config.py     | 1 +
 nemo/export/vllm/model_converters.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py
index 4f64839b17b7..21151adbf658 100644
--- a/nemo/export/vllm/model_config.py
+++ b/nemo/export/vllm/model_config.py
@@ -210,6 +210,7 @@ def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]:
             'num_key_value_heads': 'num_query_groups',
             # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
             'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
+            'tie_word_embeddings': 'share_embeddings_and_output_weights',
             'rms_norm_eps': 'layernorm_epsilon',
             'attention_dropout': 'attention_dropout',
             'initializer_range': 'init_method_std',
diff --git a/nemo/export/vllm/model_converters.py b/nemo/export/vllm/model_converters.py
index f67fd594aa7b..5e4cf619d281 100644
--- a/nemo/export/vllm/model_converters.py
+++ b/nemo/export/vllm/model_converters.py
@@ -78,7 +78,8 @@ def convert_weights(self, nemo_model_config, state_dict):
 
         yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
         yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
-        yield ('lm_head.weight', state_dict['model.output_layer.weight'])
+        if not nemo_model_config.get("share_embeddings_and_output_weights", False):
+            yield ('lm_head.weight', state_dict['model.output_layer.weight'])
 
         for layer in range(int(num_layers)):
             qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]