diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 2f160a881bb1..2d7f7d851878 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2279,6 +2279,14 @@ def __call__(
         )

         if is_batched:
+            if isinstance(text_pair, str):
+                raise TypeError(
+                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
+                )
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
+                )
             batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
             return self.batch_encode_plus(
                 batch_text_or_text_pairs=batch_text_or_text_pairs,
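
For context, a minimal sketch of how these guards behave from the caller's side once the patch is applied. The checkpoint name is illustrative; any pretrained tokenizer would behave the same way, since the checks live in the shared `__call__` of `tokenization_utils_base.py`:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; the validation is tokenizer-agnostic.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

texts = ["first sentence", "second sentence"]

# Valid: `text_pair` is a list with the same length as `text`.
enc = tokenizer(texts, ["pair one", "pair two"])

# Raises TypeError: a bare string is not allowed as `text_pair`
# when `text` is a batch.
try:
    tokenizer(texts, "a single pair string")
except TypeError as e:
    print(e)

# Raises ValueError: batch lengths differ (2 vs. 1).
try:
    tokenizer(texts, ["only one pair"])
except ValueError as e:
    print(e)
```

Without these checks, `list(zip(text, text_pair))` would silently truncate to the shorter input (or pair each text with a single character of a string `text_pair`), so failing loudly here surfaces the caller's mistake instead of producing corrupted pairs.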