-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseExistingIndex.py
59 lines (45 loc) · 1.88 KB
/
parseExistingIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
def parse_text_file(filename, *, encoding=None):
    """Extract candidate index entries from a text file.

    The file is read in full, split on commas and standalone numbers, and
    each resulting piece is cleaned up:

    1. pieces of two characters or fewer (after stripping) are dropped;
    2. pieces that still contain a number followed by whitespace, or a
       numeric range such as ``12-15``, are dropped;
    3. each remaining piece is cut at the last occurrence of every
       truncation marker, in order, keeping the text to its left.

    Truncation can leave entries that are short or empty; they are returned
    as-is, so callers that need a minimum length must filter again.

    Args:
        filename: Path of the text file to parse.
        encoding: Text encoding passed to ``open()``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        A list of cleaned entry strings in the order they appear in the file.
    """
    with open(filename, 'r', encoding=encoding) as file:
        text = file.read()

    # A comma sitting between two non-digit characters is replaced by a
    # space, so it does not act as a delimiter in the split below.
    text = re.sub(r'(?<=\D),(?=\D)', ' ', text)
    # NOTE: a second substitution used to follow here that replaced the
    # zero-width boundaries between digits and non-digits with ''. Replacing
    # an empty match with an empty string cannot change the text, so it was
    # removed.

    # Split on the remaining commas and on standalone numbers.
    delimiter = r',|\b\d+\b'
    pieces = (piece.strip() for piece in re.split(delimiter, text))
    items = [piece for piece in pieces if len(piece) > 2]

    # Drop pieces that contain "<number><whitespace>..." or "<number>-<number>".
    items = [
        item
        for item in items
        if not re.search(r'\b\d+\s.*\b|\b\d+-\d+\b', item)
    ]

    # Cut each entry at the last occurrence of every marker, applied in this
    # order. NOTE(review): the two non-space markers look like residue of
    # mis-decoded punctuation; confirm against the input file.
    truncate_patterns = (" ", "ג", "€")
    truncated = []
    for item in items:
        for pattern in truncate_patterns:
            if pattern in item:
                item = item.rsplit(pattern, 1)[0].strip()
        truncated.append(item)
    return truncated
def process_words_list(words):
    """Split entries of the form ``text (extra)`` into separate words.

    For every entry that contains an opening bracket followed later by a
    closing bracket, the text before the first ``(`` and the text between
    that ``(`` and the next ``)`` are emitted as two separate entries
    (each stripped, and skipped when empty). Anything after the closing
    bracket is discarded. Entries without such a bracket pair are passed
    through unchanged.

    Args:
        words: Iterable of entry strings.

    Returns:
        A new list with bracketed entries expanded; input order is kept.
    """
    processed_words = []
    for word in words:
        before_bracket, opener, remainder = word.partition('(')
        # Look for the closing bracket only *after* the opening one, so a
        # stray ')' earlier in the entry cannot produce an empty slice and
        # silently lose the bracketed text.
        inside_bracket, closer, _ = remainder.partition(')')

        if not (opener and closer):
            # No "(...)" pair in the right order: keep the entry as it is.
            processed_words.append(word)
            continue

        before_bracket = before_bracket.strip()
        inside_bracket = inside_bracket.strip()
        if before_bracket:
            processed_words.append(before_bracket)
        if inside_bracket:
            processed_words.append(inside_bracket)
    return processed_words
# usage: parse the existing index, expand bracketed entries, save the result
items = [
    item
    for item in parse_text_file('existingIndex.txt')
    if len(item) > 2
]
processed_words_list = process_words_list(items)
print(processed_words_list)

output_file_path = 'index_words_list.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for word in processed_words_list:
        # Entries of two characters or fewer are printed above but not saved.
        if len(word) <= 2:
            continue
        output_file.write(word + '\n')