process_writtings_build_resources_from_items.py
import glob
import json

import util  # local helper module providing clean_up_transcribed_text()
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Count the number of GPT-2 tokens in a string."""
    return len(tokenizer.encode(text))
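# Quick sanity check, assuming the standard GPT-2 BPE vocabulary
# (illustrative only, not part of the original script):
#   count_tokens("Hello world")  # -> 2 tokens: "Hello", " world"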
# Map each source directory to the list of numeric item ids it contains.
file_item_order = {}
for file in glob.glob('anthony-speeches-and-other-writings-*/*.json'):
    dir_name = file.split('/')[-2]
    # Skip the output directory itself.
    if dir_name == 'anthony-speeches-and-other-writings-resources':
        continue
    # if dir_name != 'anthony-speeches-and-other-writings-1862':
    #     continue
    if dir_name not in file_item_order:
        file_item_order[dir_name] = []
    file_id = int(file.split('/')[-1].replace('.json', ''))
    file_item_order[dir_name].append(file_id)

# Process items in ascending id order within each directory.
for dir_name in file_item_order:
    file_item_order[dir_name] = sorted(file_item_order[dir_name])

print(file_item_order)
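# Illustrative shape of file_item_order (ids here are examples, not real data):
#   {'anthony-speeches-and-other-writings-1862': [1, 2, 3, ...], ...}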
# Concatenate items within each directory into blocks of roughly 500+ tokens,
# then write that directory's blocks to the resources folder.
for dir_name in file_item_order:
    all_text = ""
    all_items = []
    all_items_ids = []
    blocks = []
    for item_id in file_item_order[dir_name]:
        print(item_id)
        with open(f"{dir_name}/{item_id}.json") as f:
            data = json.load(f)
        # Skip items that have no transcription yet.
        if 'full_text' not in data:
            continue
        full_text = util.clean_up_transcribed_text(data['full_text'])
        data['id'] = item_id
        all_text = all_text + full_text
        all_items.append(data)
        all_items_ids.append(item_id)
        # Once the running text reaches 500 tokens, emit a block and reset.
        if count_tokens(all_text) >= 500:
            blocks.append({
                'items': all_items[:],
                'text': all_text,
                'tokenCount': count_tokens(all_text),
            })
            print(all_items_ids)
            all_text = ''
            all_items = []
            all_items_ids = []
    # Leftovers: emit whatever is still accumulated as a final, smaller block.
    if len(all_items) > 0:
        blocks.append({
            'items': all_items[:],
            'text': all_text,
            'tokenCount': count_tokens(all_text),
        })
    with open(f'anthony-speeches-and-other-writings-resources/{dir_name}.json', 'w') as f:
        json.dump(blocks, f, indent=2)
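

# ---------------------------------------------------------------------------
# util is a local module that is not included in this file. The sketch below
# is a rough, hypothetical stand-in for what util.clean_up_transcribed_text()
# might do (whitespace normalization); the real implementation may differ.
# It is illustrative only and is not called by the script above.
# ---------------------------------------------------------------------------
import re


def clean_up_transcribed_text_sketch(text: str) -> str:
    """Hypothetical sketch of util.clean_up_transcribed_text().

    Collapses runs of whitespace and keeps a trailing space so that
    concatenated items do not run together.
    """
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace, including newlines
    return text.strip() + ' '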