-
Notifications
You must be signed in to change notification settings - Fork 2
/
8_append_sentences.py
53 lines (44 loc) · 2.26 KB
/
8_append_sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import csv
import ujson
import time
import datetime
from utilwebisadb import set_csv_field_size, get_ids_in_range
import mmap
#@profile
def append_sentences(in_path, out_path, *,
                     sentences_info_path='sentences_info.json',
                     sentences_bin_path='E:\\sentences_sorted_equidistant.bin'):
    """Copy a CSV file, replacing column 14 with its provenance sentences.

    For every input row, the value in column index 14 is passed to
    ``get_ids_in_range`` together with the id bounds from the metadata file.
    For each id returned, the matching fixed-width record is read from the
    binary sentence file and the list of ``(id, sentence, pld)`` entries is
    written back, JSON-encoded, in place of column 14. All other columns are
    copied unchanged. Progress is printed every 1000 rows and the elapsed
    time is printed at the end.

    Args:
        in_path: Path of the CSV file to read; rows need at least 15 columns.
        out_path: Path of the CSV file to write (overwritten if it exists).
        sentences_info_path: JSON file providing the keys
            ``max_length_sentence``, ``max_length_pld``, ``min_id`` and
            ``max_id``. Defaults to the location the script always used.
        sentences_bin_path: Binary file of equally sized records, one per id,
            each a sentence field followed by a pld field. Defaults to the
            location the script always used.

    Raises:
        OSError: If any of the files cannot be opened.
        IndexError: If an input row has fewer than 15 columns.
    """
    # Record layout of the binary sentence file. The lengths are byte counts,
    # because the file is read in binary mode below.
    with open(sentences_info_path, 'r') as fp:
        data = ujson.load(fp)
    max_length_sentence = data['max_length_sentence']
    max_length_pld = data['max_length_pld']
    length_one_entry = max_length_sentence + max_length_pld
    min_id = data['min_id']
    max_id = data['max_id']

    # Both CSV files are opened with newline='' as the csv module requires;
    # without it, newlines embedded in quoted fields are not read back
    # faithfully.
    with open(in_path, newline='') as in_file, \
            open(out_path, "w", newline='') as out_file, \
            open(sentences_bin_path, 'rb') as sentences_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        start = time.perf_counter()
        for i, row in enumerate(reader):
            # Presumably yields the ids referenced by row[14] that fall
            # within [min_id, max_id] - confirm against utilwebisadb.
            prov_id_list = get_ids_in_range(row[14], min_id, max_id)
            prov_list = []
            for prov_id in prov_id_list:
                # One seek per id: records are addressed directly by their
                # offset from min_id. This random access is why the original
                # author notes that an SSD is needed for acceptable speed.
                sentences_file.seek((prov_id - min_id) * length_one_entry)
                # The two reads consume the sentence field and then the pld
                # field of the same record; trailing whitespace (presumably
                # the fixed-width padding) is stripped.
                sentence = sentences_file.read(max_length_sentence).decode("utf-8").rstrip()
                pld = sentences_file.read(max_length_pld).decode("utf-8").rstrip()
                prov_list.append((prov_id, sentence, pld))
            writer.writerow(row[:14] + [ujson.dumps(prov_list)] + row[15:])
            if i % 1000 == 0:
                print("{} - {}".format(datetime.datetime.now(), i))
        print('finished in {} sec'.format(time.perf_counter() - start))
if __name__ == "__main__":
    # Project helper from utilwebisadb; presumably raises the csv module's
    # field size limit so very long fields can be parsed - confirm there.
    set_csv_field_size()

    # Alternative inputs follow the same naming scheme, e.g.
    # 'webisa_{i}.csv', 'webisa_{i}_sample_results.csv' or
    # 'webisa_{i}_cycles_results.csv', each written to '<name>_with_sent.csv'.
    source_csv = 'webisa_1_sentence_results.csv'
    target_csv = 'webisa_1_sentence_results_with_sent.csv'
    append_sentences(source_csv, target_csv)