-
Notifications
You must be signed in to change notification settings - Fork 66
fix: do not add special tokens for custom tokenizer #279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -176,11 +176,9 @@ def test_run_causallm_pt_and_inference(): | |
| _validate_training(tempdir) | ||
| checkpoint_path = _get_checkpoint_path(tempdir) | ||
| adapter_config = _get_adapter_config(checkpoint_path) | ||
| # tokenizer_name_or_path from model arguments is passed | ||
| # while preparing the prompt tuning config which | ||
| # defaults to model_name_or_path if not explicitly set. | ||
|
|
||
| _validate_adapter_config( | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.tokenizer_name_or_path | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.model_name_or_path | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since we are not modifying tokenizer_name_or_path post init of the dataclass and retaining it as is (None for default), I have reverted this check to use model_name_or_path instead since a custom tokenizer is not being passed in the prior code. |
||
| ) | ||
|
|
||
| # Load the model | ||
|
|
@@ -214,11 +212,8 @@ def test_run_causallm_pt_and_inference_with_formatting_data(): | |
| _validate_training(tempdir) | ||
| checkpoint_path = _get_checkpoint_path(tempdir) | ||
| adapter_config = _get_adapter_config(checkpoint_path) | ||
| # tokenizer_name_or_path from model arguments is passed | ||
| # while preparing the prompt tuning config which | ||
| # defaults to model_name_or_path if not explicitly set. | ||
| _validate_adapter_config( | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.tokenizer_name_or_path | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.model_name_or_path | ||
| ) | ||
|
|
||
| # Load the model | ||
|
|
@@ -250,11 +245,8 @@ def test_run_causallm_pt_and_inference_JSON_file_formatter(): | |
| _validate_training(tempdir) | ||
| checkpoint_path = _get_checkpoint_path(tempdir) | ||
| adapter_config = _get_adapter_config(checkpoint_path) | ||
| # tokenizer_name_or_path from model arguments is passed | ||
| # while preparing the prompt tuning config which | ||
| # defaults to model_name_or_path if not explicitly set. | ||
| _validate_adapter_config( | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.tokenizer_name_or_path | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.model_name_or_path | ||
| ) | ||
|
|
||
| # Load the model | ||
|
|
@@ -285,11 +277,8 @@ def test_run_causallm_pt_init_text(): | |
| _validate_training(tempdir) | ||
| checkpoint_path = _get_checkpoint_path(tempdir) | ||
| adapter_config = _get_adapter_config(checkpoint_path) | ||
| # tokenizer_name_or_path from model arguments is passed | ||
| # while preparing the prompt tuning config which | ||
| # defaults to model_name_or_path if not explicitly set. | ||
| _validate_adapter_config( | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.tokenizer_name_or_path | ||
| adapter_config, "PROMPT_TUNING", MODEL_ARGS.model_name_or_path | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -349,6 +338,20 @@ def test_run_causallm_pt_with_validation_data_formatting(): | |
| _validate_training(tempdir, check_eval=True) | ||
|
|
||
|
|
||
| def test_run_causallm_pt_with_custom_tokenizer(): | ||
| """Check if we fail when custom tokenizer not having pad token is used in prompt tuning""" | ||
| with tempfile.TemporaryDirectory() as tempdir: | ||
| train_args = copy.deepcopy(TRAIN_ARGS) | ||
| model_args = copy.deepcopy(MODEL_ARGS) | ||
| model_args.tokenizer_name_or_path = model_args.model_name_or_path | ||
| train_args.output_dir = tempdir | ||
| train_args.eval_strategy = "epoch" | ||
| data_args = copy.deepcopy(DATA_ARGS) | ||
| data_args.validation_data_path = TWITTER_COMPLAINTS_DATA | ||
| with pytest.raises(ValueError): | ||
| sft_trainer.train(model_args, data_args, train_args, PEFT_PT_ARGS) | ||
|
|
||
|
|
||
|
Comment on lines
+341
to
+354
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a test to reproduce the pad token error when custom tokenizer not having pad token is used in prompt tuning. This as well tests that special tokens are being avoided when a custom tokenizer is passed. |
||
| ############################# Lora Tests ############################# | ||
|
|
||
| target_modules_val_map = [ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,15 +51,15 @@ class ModelArguments: | |
| tokenizer_name_or_path: Optional[str] = field( | ||
| default=None, | ||
| metadata={ | ||
| "help": "Path to custom tokenizer.\ | ||
| If not provided it defaults to model_name_or_path" | ||
| "help": "Path to custom tokenizer. \ | ||
| If not provided it defaults to model_name_or_path \ | ||
| and special tokens will be added as needed for specific tokenizer classes. \ | ||
| For prompt tuning, if tokenizer_name_or_path provided, special tokens are not added, \ | ||
| otherwise, it defaults to model_name_or_path with special tokens for specific \ | ||
| tokenizer classes." | ||
|
Comment on lines
+54
to
+59
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Detailed on the usage of tokenizer_name_or_path. |
||
| }, | ||
| ) | ||
|
|
||
| def __post_init__(self): | ||
| if not self.tokenizer_name_or_path: | ||
| self.tokenizer_name_or_path = self.model_name_or_path | ||
|
|
||
|
|
||
|
Comment on lines
-59
to
63
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We dont want to do this anymore since its difficult to figure out if a custom tokenizer has been passed or not in the subsequent parts of the code. |
||
| @dataclass | ||
| class DataArguments: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -190,32 +190,46 @@ def train( | |
|
|
||
| # TODO: Move these to a config as well | ||
| tokenizer = AutoTokenizer.from_pretrained( | ||
| model_args.tokenizer_name_or_path, cache_dir=train_args.cache_dir, use_fast=True | ||
| ( | ||
| model_args.tokenizer_name_or_path | ||
| if model_args.tokenizer_name_or_path | ||
| else model_args.model_name_or_path | ||
|
Comment on lines
+194
to
+196
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if a custom tokenizer provided we pass it otherwise default to model_name_or_path |
||
| ), | ||
| cache_dir=train_args.cache_dir, | ||
| use_fast=True, | ||
| ) | ||
|
|
||
| # Calculate and save additional metrics to track later. | ||
| additional_metrics["model_load_time"] = time.time() - model_load_time | ||
|
|
||
| peft_config = get_hf_peft_config( | ||
| task_type, peft_config, model_args.tokenizer_name_or_path | ||
| task_type, | ||
| peft_config, | ||
| ( | ||
| model_args.tokenizer_name_or_path | ||
| if model_args.tokenizer_name_or_path | ||
| else model_args.model_name_or_path | ||
| ), | ||
| ) | ||
|
Comment on lines
+208
to
213
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Even for PEFT, if custom tokenizer is passed we pass it on otherwise we still consider to pass model_name_or_path as tokenizer. |
||
|
|
||
| # TODO: understand if we need to hardcode these here or just use defaults in model | ||
| if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)): | ||
| tokenizer.add_special_tokens( | ||
| { | ||
| "bos_token": "<s>", | ||
| "eos_token": "</s>", | ||
| "unk_token": "<unk>", | ||
| "pad_token": "<pad>", | ||
| } | ||
| ) | ||
| elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)): | ||
| tokenizer.add_special_tokens( | ||
| { | ||
| "pad_token": "<pad>", | ||
| } | ||
| ) | ||
| # add special tokens only when a custom tokenizer is not passed | ||
| if not model_args.tokenizer_name_or_path: | ||
| # TODO: understand if we need to hardcode these here or just use defaults in model | ||
| if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)): | ||
| tokenizer.add_special_tokens( | ||
| { | ||
| "bos_token": "<s>", | ||
| "eos_token": "</s>", | ||
| "unk_token": "<unk>", | ||
| "pad_token": "<pad>", | ||
| } | ||
| ) | ||
| elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)): | ||
| tokenizer.add_special_tokens( | ||
| { | ||
| "pad_token": "<pad>", | ||
| } | ||
| ) | ||
|
Comment on lines
+216
to
+232
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Skip this step when custom tokenizer is passed |
||
|
|
||
| max_seq_length = min(train_args.max_seq_length, tokenizer.model_max_length) | ||
| logger.info("Max sequence length is %s", max_seq_length) | ||
|
|
@@ -228,20 +242,22 @@ def train( | |
| tokenizer.model_max_length, | ||
| ) | ||
|
|
||
| # TODO: we need to change this, perhaps follow what open instruct does? | ||
| # add special tokens only when a custom tokenizer is not passed | ||
| special_tokens_dict = {} | ||
| if tokenizer.pad_token is None: | ||
| logger.warning("PAD token set to default, missing in tokenizer") | ||
| special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN | ||
| if tokenizer.eos_token is None: | ||
| logger.warning("EOS token set to default, missing in tokenizer") | ||
| special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN | ||
| if tokenizer.bos_token is None: | ||
| logger.warning("BOS token set to default, missing in tokenizer") | ||
| special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN | ||
| if tokenizer.unk_token is None: | ||
| logger.warning("UNK token set to default, missing in tokenizer") | ||
| special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN | ||
| if not model_args.tokenizer_name_or_path: | ||
| # TODO: we need to change this, perhaps follow what open instruct does? | ||
| if tokenizer.pad_token is None: | ||
| logger.warning("PAD token set to default, missing in tokenizer") | ||
| special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN | ||
| if tokenizer.eos_token is None: | ||
| logger.warning("EOS token set to default, missing in tokenizer") | ||
| special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN | ||
| if tokenizer.bos_token is None: | ||
| logger.warning("BOS token set to default, missing in tokenizer") | ||
| special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN | ||
| if tokenizer.unk_token is None: | ||
| logger.warning("UNK token set to default, missing in tokenizer") | ||
| special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN | ||
|
Comment on lines
+245
to
+260
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Skip adding special tokens when custom tokenizer is passed
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it, makes sense |
||
|
|
||
| # TODO: lower priority but understand if resizing impacts inference quality and why its needed. | ||
| # It makes sense if we manipulate tokenizer that we also save it and provide it to inference. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will trigger custom tokenizer and does not add pad tokens which would fail this test, so had to remove it.