-
Notifications
You must be signed in to change notification settings - Fork 38
/
preprocess_testset.py
executable file
·85 lines (65 loc) · 3.52 KB
/
preprocess_testset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Preprocess the standard keyphrase-extraction test collections (inspec,
nus, semeval, krapivin, duc) into the serialized format produced by
pykp.io, reusing the vocabulary previously built from the KP20k corpus.
"""
import argparse
import os
import torch
import config   # project-local: registers the shared preprocessing options
import pykp.io  # project-local: tokenization and dataset-export helpers
parser = argparse.ArgumentParser(
    description='preprocess_testset.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# **Preprocess Options**
parser.add_argument('-source_dataset_root_dir', default='source_data/',
                    help="The path to the source data (raw json).")
parser.add_argument('-output_path_prefix', default='data/',
                    help="Output file for the prepared data")
config.preprocess_opts(parser)  # add the shared preprocessing flags to this parser
# Parsed at import time; `opt` is read (and mutated) by main() below.
opt = parser.parse_args()
def main():
    """Preprocess the five standard keyphrase test collections.

    For each dataset this loads the raw training/testing JSON files,
    tokenizes the title/abstract (source) and keyword (target) fields via
    ``pykp.io.load_src_trgs_pairs``, and exports the numericalized pairs
    with ``pykp.io.process_and_export_dataset`` using the shared KP20k
    vocabulary. Reads and mutates the module-level ``opt`` namespace.
    """
    test_dataset_names = ['inspec', 'nus', 'semeval', 'krapivin', 'duc']
    src_fields = ['title', 'abstract']
    trg_fields = ['keyword']

    print("Loading Vocab...")
    # Reuse the vocabulary built from the KP20k training corpus so word ids
    # are consistent across every test collection.
    opt.vocab_path = os.path.join(opt.output_path_prefix, 'kp20k', 'kp20k.vocab.pt')
    print(os.path.abspath(opt.vocab_path))
    # BUG FIX: torch.load's second positional argument is `map_location`,
    # not a file mode -- the original passed 'rb', which would make any
    # tensor storage in the file fail to restore. Load onto CPU explicitly.
    word2id, id2word, vocab = torch.load(opt.vocab_path, map_location='cpu')
    print('Vocab size = %d' % len(vocab))

    for test_dataset_name in test_dataset_names:
        opt.source_train_file = os.path.join(
            opt.source_dataset_root_dir, test_dataset_name,
            '%s_training.json' % (test_dataset_name))
        opt.source_test_file = os.path.join(
            opt.source_dataset_root_dir, test_dataset_name,
            '%s_testing.json' % (test_dataset_name))
        # Output path for exporting the processed dataset.
        opt.output_path = os.path.join(opt.output_path_prefix, test_dataset_name)
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...)` guard.
        os.makedirs(opt.output_path, exist_ok=True)

        print("Loading training/validation/test data...")
        # valid_check is off: test collections are kept as-is, even pairs
        # that would be filtered from a training corpus.
        tokenized_train_pairs = pykp.io.load_src_trgs_pairs(
            source_json_path=opt.source_train_file,
            dataset_name=test_dataset_name,
            src_fields=src_fields,
            trg_fields=trg_fields,
            valid_check=False,
            opt=opt)
        tokenized_test_pairs = pykp.io.load_src_trgs_pairs(
            source_json_path=opt.source_test_file,
            dataset_name=test_dataset_name,
            src_fields=src_fields,
            trg_fields=trg_fields,
            valid_check=False,
            opt=opt)

        print("Exporting complete dataset")
        # include_original keeps the raw (untokenized) text alongside the
        # numericalized data so evaluation can report original strings.
        pykp.io.process_and_export_dataset(
            tokenized_train_pairs,
            word2id, id2word,
            opt,
            opt.output_path,
            dataset_name=test_dataset_name,
            data_type='train',
            include_original=True)
        pykp.io.process_and_export_dataset(
            tokenized_test_pairs,
            word2id, id2word,
            opt,
            opt.output_path,
            dataset_name=test_dataset_name,
            data_type='test',
            include_original=True)
if __name__ == "__main__":
    # Run preprocessing only when executed as a script, not on import.
    main()