Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datasets
Submodule datasets added at 8afd0b
2 changes: 0 additions & 2 deletions examples/pytorch/question-answering/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,6 @@ accelerate test
that will check everything is ready for training. Finally, you can launch training with

```bash
export TASK_NAME=mrpc

accelerate launch run_qa_no_trainer.py \
--model_name_or_path bert-base-uncased \
--dataset_name squad \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ def parse_args():
parser.add_argument(
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
)
parser.add_argument(
"--test_file", type=str, default=None, help="A csv or a json file containing the Prediction data."
)
parser.add_argument(
"--max_seq_length",
type=int,
Expand Down Expand Up @@ -202,15 +205,23 @@ def parse_args():
args = parser.parse_args()

# Sanity checks
if args.dataset_name is None and args.train_file is None and args.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
if (
args.dataset_name is None
and args.train_file is None
and args.validation_file is None
and args.test_file is None
):
raise ValueError("Need either a dataset name or a training/validation/test file.")
else:
if args.train_file is not None:
extension = args.train_file.split(".")[-1]
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
if args.validation_file is not None:
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.test_file is not None:
extension = args.test_file.split(".")[-1]
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."

if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
Expand Down Expand Up @@ -263,8 +274,10 @@ def main():
data_files["train"] = args.train_file
if args.validation_file is not None:
data_files["validation"] = args.validation_file
if args.test_file is not None:
data_files["test"] = args.test_file
extension = args.train_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

Expand Down Expand Up @@ -535,13 +548,15 @@ def prepare_validation_features(examples):
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
)

eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
eval_dataloader = DataLoader(
eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
)

if args.do_predict:
predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
predict_dataset_for_model = predict_dataset.remove_columns(["example_id", "offset_mapping"])
predict_dataloader = DataLoader(
predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
)

# Post-processing:
Expand Down Expand Up @@ -709,21 +724,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len)
end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, eval_dataset, max_len)
end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len)
all_cls_logits = np.concatenate(all_cls_logits, axis=0)
cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

# delete the list of numpy arrays
del start_top_log_probs
del start_top_index
del end_top_log_probs
del end_top_index
del cls_logits

eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys()))
outputs_numpy = (
start_top_log_probs_concat,
start_top_index_concat,
end_top_log_probs_concat,
end_top_index_concat,
cls_logits,
cls_logits_concat,
)
prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
Expand Down Expand Up @@ -766,21 +781,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
start_top_index_concat = create_and_fill_np_array(all_start_top_index, predict_dataset, max_len)
end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, predict_dataset, max_len)
end_top_index_concat = create_and_fill_np_array(all_end_top_index, predict_dataset, max_len)
all_cls_logits = np.concatenate(all_cls_logits, axis=0)
cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

# delete the list of numpy arrays
del start_top_log_probs
del start_top_index
del end_top_log_probs
del end_top_index
del cls_logits

predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys()))
outputs_numpy = (
start_top_log_probs_concat,
start_top_index_concat,
end_top_log_probs_concat,
end_top_index_concat,
cls_logits,
cls_logits_concat,
)

prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy)
Expand Down
34 changes: 23 additions & 11 deletions examples/pytorch/question-answering/run_qa_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@ def parse_args():
parser.add_argument(
"--preprocessing_num_workers", type=int, default=4, help="A csv or a json file containing the training data."
)
parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model")
parser.add_argument("--do_predict", action="store_true", help="To do prediction on the question answering model")
parser.add_argument(
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
)
parser.add_argument(
"--test_file", type=str, default=None, help="A csv or a json file containing the Prediction data."
)
parser.add_argument(
"--max_seq_length",
type=int,
Expand Down Expand Up @@ -231,15 +234,23 @@ def parse_args():
args = parser.parse_args()

# Sanity checks
if args.dataset_name is None and args.train_file is None and args.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
if (
args.dataset_name is None
and args.train_file is None
and args.validation_file is None
and args.test_file is None
):
raise ValueError("Need either a dataset name or a training/validation/test file.")
else:
if args.train_file is not None:
extension = args.train_file.split(".")[-1]
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
if args.validation_file is not None:
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.test_file is not None:
extension = args.test_file.split(".")[-1]
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."

if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
Expand Down Expand Up @@ -292,8 +303,10 @@ def main():
data_files["train"] = args.train_file
if args.validation_file is not None:
data_files["validation"] = args.validation_file
if args.test_file is not None:
data_files["test"] = args.test_file
extension = args.train_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

Expand Down Expand Up @@ -540,13 +553,15 @@ def prepare_validation_features(examples):
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
)

eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
eval_dataloader = DataLoader(
eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
)

if args.do_predict:
predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
predict_dataset_for_model = predict_dataset.remove_columns(["example_id", "offset_mapping"])
predict_dataloader = DataLoader(
predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
)

# Post-processing:
Expand Down Expand Up @@ -704,7 +719,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
del all_start_logits
del all_end_logits

eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys()))
outputs_numpy = (start_logits_concat, end_logits_concat)
prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
Expand Down Expand Up @@ -736,8 +750,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
del all_start_logits
del all_end_logits

# Now we need to add extra columns which we removed for post processing
predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys()))
outputs_numpy = (start_logits_concat, end_logits_concat)
prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy)
predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
Expand Down