diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index 8e238f725828..8fde5d8d2ed6 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -77,10 +77,16 @@ python run_clm.py \
     --output_dir /tmp/test-clm
 ```
 
+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
+
 ### Whole word masking
 
 The BERT authors released a new version of BERT using Whole Word Masking in May 2019. Instead of masking randomly
-selected tokens (which may be aprt of words), they mask randomly selected words (masking all the tokens corresponding
+selected tokens (which may be part of words), they mask randomly selected words (masking all the tokens corresponding
 to that word). This technique has been refined for Chinese in [this paper](https://arxiv.org/abs/1906.08101).
 
 To fine-tune a model using whole word masking, use the following script:
@@ -111,8 +117,8 @@ It works well on so many Chines Task like CLUE (Chinese GLUE).
 They use LTP, so we need LTP.
 
 Now LTP only only works well on `transformers==3.2.0`. So we don't add it to requirements.txt.
-You need to create a separate enviromnent with this version of Transformers to run the `run_chinese_ref.py` script that
-will create the reference files. The script is in `examples/contrib`. Once in the proper enviromnent, run the
+You need to create a separate environment with this version of Transformers to run the `run_chinese_ref.py` script that
+will create the reference files. The script is in `examples/contrib`. Once in the proper environment, run the
 following:
@@ -144,6 +150,8 @@ python run_mlm_wwm.py \
     --output_dir /tmp/test-mlm-wwm
 ```
 
+**Note:** On TPU, you should use the flag `--pad_to_max_length` to make sure all your batches have the same length.
+
 ### XLNet and permutation language modeling
 
 XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
@@ -179,3 +187,9 @@ python run_plm.py \
     --do_eval \
     --output_dir /tmp/test-plm
 ```
+
+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
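The README notes above come down to a single tokenizer argument. As a minimal illustration (not part of this diff; `bert-base-cased` is an arbitrary checkpoint chosen only for the example), this is what `--pad_to_max_length` changes: with dynamic padding the encoded lines have different lengths, while `padding="max_length"` gives every line the same fixed length, which is what keeps batch shapes static on TPU.

```python
# Sketch only: how --pad_to_max_length maps onto the tokenizer's `padding` argument.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # arbitrary checkpoint for illustration
lines = ["A short line.", "A somewhat longer line that will produce quite a few more tokens."]

# Default behaviour (padding=False): each line keeps its own length, and padding
# happens later, dynamically, when the batch is collated.
dynamic = tokenizer(lines, truncation=True, max_length=128)
print([len(ids) for ids in dynamic["input_ids"]])  # two different lengths

# --pad_to_max_length -> padding="max_length": every line is padded to max_seq_length,
# so every batch has exactly the same shape (what TPUs need to avoid recompilation).
padded = tokenizer(lines, padding="max_length", truncation=True, max_length=128)
print([len(ids) for ids in padded["input_ids"]])  # [128, 128]
```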
diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py
index e9a5c847d4f5..8d743822dd03 100644
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -116,6 +116,17 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -246,18 +257,73 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
 
-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
-
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
 
     # Data collator
     # This one will take care of randomly masking the tokens.
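To make the `group_texts` behaviour above concrete, here is a self-contained toy run of the same concatenate-and-chunk logic (plain Python, no tokenizer; the token ids are made up). It shows how the trailing remainder that does not fill a full block of `max_seq_length` is dropped.

```python
# Toy re-run of the group_texts logic from run_mlm.py on hand-written token ids.
max_seq_length = 4

examples = {
    "input_ids": [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10]],
    "attention_mask": [[1, 1, 1], [1, 1, 1, 1, 1], [1, 1]],
}

# Concatenate all texts, then keep only as many tokens as fit in full chunks.
concatenated = {k: sum(examples[k], []) for k in examples.keys()}
total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length

# Split by chunks of max_seq_length.
result = {
    k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
    for k, t in concatenated.items()
}

print(result["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- tokens 9 and 10 are dropped
```

With `batched=True`, `datasets.map` applies the same logic to each batch of 1,000 texts, which is why the comment in the script mentions one dropped remainder per group.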
diff --git a/examples/language-modeling/run_mlm_wwm.py b/examples/language-modeling/run_mlm_wwm.py
index b4e6f796cd78..9a830f5f929f 100644
--- a/examples/language-modeling/run_mlm_wwm.py
+++ b/examples/language-modeling/run_mlm_wwm.py
@@ -120,6 +120,13 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
 
     def __post_init__(self):
         if self.train_file is not None:
@@ -253,10 +260,12 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
 
+    padding = "max_length" if data_args.pad_to_max_length else False
+
     def tokenize_function(examples):
         # Remove empty lines
         examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
+        return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
 
     tokenized_datasets = datasets.map(
         tokenize_function,
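The whole-word-masking script keeps line-by-line preprocessing, so the change here is only wiring the new flag into the tokenizer's `padding` argument. A small sketch of that `tokenize_function` in isolation (not taken from the diff; the `bert-base-chinese` checkpoint, the flag values, and the toy batch are arbitrary) shows both the empty-line filtering and the effect of `pad_to_max_length`:

```python
# Sketch of run_mlm_wwm.py's tokenize_function, run outside the script for illustration.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # arbitrary checkpoint for illustration
pad_to_max_length = True  # stands in for --pad_to_max_length
max_seq_length = 32       # stands in for --max_seq_length

padding = "max_length" if pad_to_max_length else False

def tokenize_function(examples):
    # Remove empty lines, as the script does, so they never reach the data collator.
    examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
    return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

batch = {"text": ["我喜欢你", "", "   ", "今天天气很好"]}
encoded = tokenize_function(batch)
print(len(encoded["input_ids"]))                   # 2 -- the empty/whitespace-only lines were dropped
print([len(ids) for ids in encoded["input_ids"]])  # [32, 32] because padding="max_length"
```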
diff --git a/examples/language-modeling/run_plm.py b/examples/language-modeling/run_plm.py
index 219ee7115465..fc5a5dbb7656 100644
--- a/examples/language-modeling/run_plm.py
+++ b/examples/language-modeling/run_plm.py
@@ -113,6 +113,17 @@ class DataTrainingArguments:
     max_span_length: int = field(
         default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -243,18 +254,73 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
 
-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
-
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
 
     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(
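For context on where these preprocessed features end up, here is a hedged sketch of feeding fixed-length examples into `DataCollatorForPermutationLanguageModeling`, the collator that `run_plm.py` constructs right after this hunk. It is not taken from the script: the `xlnet-base-cased` checkpoint, the toy sentences, and `max_length=16` are arbitrary choices (an even length is used because this collator expects even sequence lengths), and the exact returned keys and tensor shapes may vary slightly across library versions.

```python
# Sketch only: fixed-length features going into the permutation-LM collator.
from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")  # arbitrary checkpoint for illustration
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1 / 6,  # the same values run_plm.py exposes as --plm_probability
    max_span_length=5,      # and --max_span_length
)

texts = [
    "Permutation language modeling predicts tokens in a random factorization order.",
    "XLNet is an autoregressive pretraining method.",
]

# padding="max_length" mirrors --pad_to_max_length; 16 is an arbitrary *even* length,
# since this collator needs even sequence lengths to build its permutation mask.
encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=16)
features = [{"input_ids": ids} for ids in encoded["input_ids"]]

batch = collator(features)
print(sorted(batch.keys()))      # e.g. ['input_ids', 'labels', 'perm_mask', 'target_mapping']
print(batch["input_ids"].shape)  # e.g. torch.Size([2, 16])
```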