diff --git a/examples/pooling/score/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py
index fc659e6025b3..a3d31ceb12a7 100644
--- a/examples/pooling/score/convert_model_to_seq_cls.py
+++ b/examples/pooling/score/convert_model_to_seq_cls.py
@@ -108,7 +108,7 @@ def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
 
 
 def converting(
-    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
+    model_name, classifier_from_tokens, path, method, use_sep_token=False, device="cpu"
 ):
     """
     Main conversion function to transform a CausalLM model to SequenceClassification.
@@ -118,7 +118,7 @@ def converting(
         classifier_from_tokens: List of tokens used for classification
         path: Output path to save the converted model
         method: Conversion method ('from_2_way_softmax' or 'no_post_processing')
-        use_pad_token: Whether to use padding token in the sequence classification model
+        use_sep_token: Whether to use a separating token in the sequence classification model
         device: Device to load the model on ('cpu' or 'cuda')
     """
     assert method in method_map, f"Unknown method: {method}"
@@ -149,10 +149,10 @@ def converting(
         causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
     )
 
-    # Configure padding token settings
-    # Note: Reranker models typically don't use padding tokens by default
-    seq_cls_model.config.use_pad_token = use_pad_token
-    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id
+    # Configure separating token settings
+    # Note: `llm as reranker` models default to not using a separating token.
+    seq_cls_model.config.use_sep_token = use_sep_token
+    seq_cls_model.config.sep_token_id = tokenizer.sep_token_id
 
     # Save the converted model and tokenizer
     seq_cls_model.save_pretrained(path)
@@ -203,6 +203,6 @@ def parse_args():
         model_name=args.model_name,
         classifier_from_tokens=json.loads(args.classifier_from_tokens),
         method=args.method,
-        use_pad_token=args.use_pad_token,
+        use_sep_token=args.use_sep_token,
         path=args.path,
     )
diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py
index 356fd0ad6678..0a57e53be20a 100644
--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
@@ -51,9 +51,9 @@ def llm_reranker_model_config():
         CROSS_ENCODER_MODEL_ID,
         runner="pooling",
     )
-    # use_pad_token is a property that reads from hf_config,
+    # use_sep_token is a property that reads from hf_config,
     # so we set it there to override the default (True)
-    config.hf_config.use_pad_token = False
+    config.hf_config.use_sep_token = False
     return config
 
 
@@ -230,7 +230,7 @@ def test_not_using_default_template(
             cross_encoder_tokenizer, full_prompt, engine_prompt
         )
 
-    def test_fallback_with_pad_token(
+    def test_fallback_with_sep_token(
         self,
         cross_encoder_model_config,
         cross_encoder_tokenizer,
@@ -238,7 +238,7 @@ def test_fallback_with_pad_token(
         mock_model_no_score_template,
     ):
         """Test fallback path when ChatTemplateResolutionError
-        and use_pad_token=True."""
+        and use_sep_token=True."""
         with (
             patch(
                 "vllm.model_executor.model_loader.get_model_cls",
@@ -250,7 +250,7 @@
             ),
         ):
             full_prompt, engine_prompt = get_score_prompt(
-                cross_encoder_model_config,  # use_pad_token=True
+                cross_encoder_model_config,  # use_sep_token=True
                 cross_encoder_tokenizer,
                 tokenization_kwargs,
                 "query",
@@ -281,7 +281,7 @@
                 add_special_tokens=False,
             )
 
-    def test_fallback_without_pad_token(
+    def test_fallback_without_sep_token(
         self,
         llm_reranker_model_config,
         cross_encoder_tokenizer,
@@ -289,7 +289,7 @@
         mock_model_no_score_template,
     ):
         """Test fallback path when ChatTemplateResolutionError
-        and use_pad_token=False."""
+        and use_sep_token=False."""
         with (
             patch(
                 "vllm.model_executor.model_loader.get_model_cls",
@@ -301,7 +301,7 @@
             ),
         ):
             full_prompt, engine_prompt = get_score_prompt(
-                llm_reranker_model_config,  # use_pad_token=False
+                llm_reranker_model_config,  # use_sep_token=False
                 cross_encoder_tokenizer,
                 tokenization_kwargs,
                 "query",
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 88da91058b60..83705e9482bf 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1434,10 +1434,18 @@ def matryoshka_dimensions(self):
         return getattr(self.hf_config, "matryoshka_dimensions", None)
 
     @property
-    def use_pad_token(self) -> bool:
-        # cross_encoder models defaults to using pad_token.
-        # `llm as reranker` models defaults to not using pad_token.
-        return getattr(self.hf_config, "use_pad_token", True)
+    def use_sep_token(self) -> bool:
+        # cross_encoder models default to using a separating token.
+        # `llm as reranker` models default to not using a separating token.
+
+        use_pad_token = getattr(self.hf_config, "use_pad_token", None)
+        if use_pad_token is not None:
+            logger.warning_once(
+                "use_pad_token has been deprecated; please use use_sep_token instead."
+            )
+            return use_pad_token
+
+        return getattr(self.hf_config, "use_sep_token", True)
 
     @property
     def head_dtype(self) -> torch.dtype:
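For orientation, here is a sketch of how the renamed `converting` entry point might be called once this patch lands. The model name, token list, and output path are hypothetical placeholders, not part of this diff, and the import assumes you run from the script's directory:

```python
# Hypothetical invocation of the conversion helper after the rename;
# model name, classifier tokens, and output path are placeholders.
from convert_model_to_seq_cls import converting

converting(
    model_name="BAAI/bge-reranker-v2-gemma",   # placeholder model
    classifier_from_tokens=["Yes"],            # token(s) backing the classifier head
    path="./bge-reranker-v2-gemma-seq-cls",    # placeholder output directory
    method="no_post_processing",               # or "from_2_way_softmax"
    use_sep_token=False,                       # `llm as reranker` default
    device="cpu",
)
```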
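The new `use_sep_token` property in `vllm/config/model.py` keeps backwards compatibility by letting a legacy `use_pad_token` attribute on the HF config take precedence. A minimal standalone sketch of that precedence, with `SimpleNamespace` standing in for the HF config:

```python
# Standalone sketch of the precedence in ModelConfig.use_sep_token:
# a legacy `use_pad_token` on the HF config is honored (with a deprecation
# warning in the real code), otherwise `use_sep_token` is read, default True.
from types import SimpleNamespace

def resolve_use_sep_token(hf_config) -> bool:
    use_pad_token = getattr(hf_config, "use_pad_token", None)
    if use_pad_token is not None:
        return use_pad_token  # deprecated spelling still wins
    return getattr(hf_config, "use_sep_token", True)

assert resolve_use_sep_token(SimpleNamespace()) is True                      # default
assert resolve_use_sep_token(SimpleNamespace(use_sep_token=False)) is False  # new name
assert resolve_use_sep_token(SimpleNamespace(use_pad_token=False)) is False  # legacy wins
```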
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index ba10a72fe7e0..a3837d9d32ab 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -199,14 +199,14 @@ def default_tokenizer_encode():
         full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2)
         prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
     else:
-        if model_config.use_pad_token:
-            # cross_encoder models defaults to using pad_token.
+        if model_config.use_sep_token:
+            # cross_encoder models default to using a separating token.
             prompt_inputs = tokenizer(
                 text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
             )
             full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
         else:
-            # `llm as reranker` models defaults to not using pad_token.
+            # `llm as reranker` models default to not using a separating token.
             full_prompt = prompt_1 + prompt_2
             prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)
     return full_prompt, prompt_inputs
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index acf1e57a59a9..43303aa76efb 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -382,9 +382,9 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         else:
             text_config.num_labels = len(tokens)
 
-        # `llm as reranker` defaults to not using pad_token
-        use_pad_token = getattr(text_config, "use_pad_token", False)
-        text_config.use_pad_token = use_pad_token
+        # `llm as reranker` models default to not using a separating token.
+        use_sep_token = getattr(text_config, "use_sep_token", False)
+        text_config.use_sep_token = use_sep_token
 
 
 def load_weights_using_from_2_way_softmax(
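To illustrate the two fallback branches in `vllm/entrypoints/score_utils.py`: with `use_sep_token=True` the tokenizer joins query and document with its own separator tokens, while without it the texts are concatenated verbatim. A sketch assuming a Hugging Face tokenizer; the checkpoint name is only an example:

```python
# Sketch of the two fallback tokenization paths; the checkpoint is an
# arbitrary BERT-style cross-encoder chosen for illustration.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
query, document = "what is vllm?", "vLLM is a fast inference engine."

# use_sep_token=True (cross-encoder default): the tokenizer inserts its own
# special tokens between the segments, e.g. [CLS] query [SEP] document [SEP].
with_sep = tokenizer(text=query, text_pair=document)

# use_sep_token=False (`llm as reranker`): plain string concatenation,
# tokenized as a single segment.
without_sep = tokenizer(text=query + document)

print(tokenizer.decode(with_sep["input_ids"]))
print(tokenizer.decode(without_sep["input_ids"]))
```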
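And the net effect of the `verify_and_update_config` change in `vllm/model_executor/models/adapters.py`: a converted checkpoint ends up with `use_sep_token=False` unless it explicitly sets the field. A toy check, with `SimpleNamespace` standing in for the HF text config:

```python
# Toy check of the adapter default; SimpleNamespace stands in for the
# HF text config of a converted checkpoint.
from types import SimpleNamespace

text_config = SimpleNamespace()
text_config.use_sep_token = getattr(text_config, "use_sep_token", False)
assert text_config.use_sep_token is False  # `llm as reranker` default
```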