diff --git a/datasets b/datasets new file mode 160000 index 000000000000..8afd0ba8c278 --- /dev/null +++ b/datasets @@ -0,0 +1 @@ +Subproject commit 8afd0ba8c27800a55ea69d9fcd702dc97d9c16d8 diff --git a/examples/pytorch/question-answering/README.md b/examples/pytorch/question-answering/README.md index e5022452284e..96bed2d06be7 100644 --- a/examples/pytorch/question-answering/README.md +++ b/examples/pytorch/question-answering/README.md @@ -172,8 +172,6 @@ accelerate test that will check everything is ready for training. Finally, you cna launch training with ```bash -export TASK_NAME=mrpc - accelerate launch run_qa_no_trainer.py \ --model_name_or_path bert-base-uncased \ --dataset_name squad \ diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index f1d5a2d03083..e1e97bece31f 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -80,6 +80,9 @@ def parse_args(): parser.add_argument( "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." ) + parser.add_argument( + "--test_file", type=str, default=None, help="A csv or a json file containing the Prediction data." + ) parser.add_argument( "--max_seq_length", type=int, @@ -202,8 +205,13 @@ def parse_args(): args = parser.parse_args() # Sanity checks - if args.dataset_name is None and args.train_file is None and args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + if ( + args.dataset_name is None + and args.train_file is None + and args.validation_file is None + and args.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation/test file.") else: if args.train_file is not None: extension = args.train_file.split(".")[-1] @@ -211,6 +219,9 @@ def parse_args(): if args.validation_file is not None: extension = args.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if args.test_file is not None: + extension = args.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) @@ -263,8 +274,10 @@ def main(): data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file + if args.test_file is not None: + data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) + raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -535,13 +548,15 @@ def prepare_validation_features(examples): train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) - eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) - eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + eval_dataloader = DataLoader( + eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + ) if args.do_predict: - predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + predict_dataset_for_model = predict_dataset.remove_columns(["example_id", "offset_mapping"]) predict_dataloader = DataLoader( - predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size ) # Post-processing: @@ -709,21 +724,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, eval_dataset, max_len) end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len) - all_cls_logits = np.concatenate(all_cls_logits, axis=0) + cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index + del cls_logits - eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys())) outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, - cls_logits, + cls_logits_concat, ) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) @@ -766,21 +781,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): start_top_index_concat = create_and_fill_np_array(all_start_top_index, predict_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, predict_dataset, max_len) end_top_index_concat = create_and_fill_np_array(all_end_top_index, predict_dataset, max_len) - all_cls_logits = np.concatenate(all_cls_logits, axis=0) + cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index + del cls_logits - predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys())) outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, - cls_logits, + cls_logits_concat, ) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 97e2c8b431d0..de020adb0228 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -81,10 +81,13 @@ def parse_args(): parser.add_argument( "--preprocessing_num_workers", type=int, default=4, help="A csv or a json file containing the training data." ) - parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model") + parser.add_argument("--do_predict", action="store_true", help="To do prediction on the question answering model") parser.add_argument( "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." ) + parser.add_argument( + "--test_file", type=str, default=None, help="A csv or a json file containing the Prediction data." + ) parser.add_argument( "--max_seq_length", type=int, @@ -231,8 +234,13 @@ def parse_args(): args = parser.parse_args() # Sanity checks - if args.dataset_name is None and args.train_file is None and args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + if ( + args.dataset_name is None + and args.train_file is None + and args.validation_file is None + and args.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation/test file.") else: if args.train_file is not None: extension = args.train_file.split(".")[-1] @@ -240,6 +248,9 @@ def parse_args(): if args.validation_file is not None: extension = args.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if args.test_file is not None: + extension = args.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) @@ -292,8 +303,10 @@ def main(): data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file + if args.test_file is not None: + data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) + raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -540,13 +553,15 @@ def prepare_validation_features(examples): train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) - eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) - eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + eval_dataloader = DataLoader( + eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + ) if args.do_predict: - predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + predict_dataset_for_model = predict_dataset.remove_columns(["example_id", "offset_mapping"]) predict_dataloader = DataLoader( - predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size ) # Post-processing: @@ -704,7 +719,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): del all_start_logits del all_end_logits - eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys())) outputs_numpy = (start_logits_concat, end_logits_concat) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) @@ -736,8 +750,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): del all_start_logits del all_end_logits - # Now we need to add extra columns which we removed for post processing - predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys())) outputs_numpy = (start_logits_concat, end_logits_concat) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)