# parseExistingIndex.py

import re

def parse_text_file(filename, encoding=None):
    """Extract candidate index terms from a text file.

    The whole file is read, split into fragments on commas and on
    stand-alone numbers, and each fragment is cleaned up.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour of using the platform's default
            encoding; pass e.g. ``"utf-8"`` for an explicit, portable decode.

    Returns:
        A list of stripped fragments in file order. Every fragment is longer
        than two characters before truncation; truncation can leave shorter
        (even empty) strings, so callers should filter again if they care.
    """
    with open(filename, 'r', encoding=encoding) as file:
        text = file.read()

    # A comma with a non-digit on both sides is turned into a space, so it no
    # longer acts as a delimiter. Commas that touch a digit (or sit at the very
    # start/end of the text) are left in place and split the text below.
    # (A second substitution used to follow here; it only matched zero-width
    # positions and replaced them with '', so it never changed the text.)
    text = re.sub(r'(?<=\D),(?=\D)', ' ', text)

    # Split on the remaining commas and on whole numbers, then keep only the
    # fragments that are longer than two characters once stripped.
    fragments = (piece.strip() for piece in re.split(r',|\b\d+\b', text))
    items = [piece for piece in fragments if len(piece) > 2]

    # Drop fragments containing a number followed by whitespace, or a
    # "number-number" range.
    numeric_noise = re.compile(r'\b\d+\s.*\b|\b\d+-\d+\b')
    items = [item for item in items if not numeric_noise.search(item)]

    # Cut each fragment at the LAST occurrence of every marker, applying the
    # markers in order, and keep what is left of it.
    # NOTE(review): "ג" and "€" look like UTF-8 punctuation (e.g. an en dash)
    # mis-decoded as cp1255 - confirm against the input file before relying on
    # a different ``encoding``.
    truncate_patterns = ["  ", "ג", "€"]
    truncated = []
    for item in items:
        for pattern in truncate_patterns:
            if pattern in item:
                item = item.rsplit(pattern, 1)[0].strip()
        truncated.append(item)

    return truncated

def process_words_list(words):
    """Split entries of the form ``"term (alias)"`` into separate words.

    For every entry that contains an opening bracket followed by a closing
    bracket, the text before the ``(`` and the text between the brackets are
    emitted as two separate entries (empty parts are skipped). Anything after
    the closing bracket is discarded. Entries without such a bracket pair are
    passed through unchanged.

    Args:
        words: Iterable of strings.

    Returns:
        A new list with bracketed entries expanded, in input order.
    """
    processed_words = []
    for word in words:
        open_pos = word.find('(')
        # Search for ")" only AFTER the "(": a stray ")" earlier in the entry
        # must not be paired with it (that used to yield an empty slice and
        # silently drop the bracketed term).
        close_pos = word.find(')', open_pos + 1) if open_pos != -1 else -1

        if close_pos == -1:
            # No complete "(...)" pair: keep the entry as it is.
            processed_words.append(word)
            continue

        before_bracket = word[:open_pos].strip()
        inside_bracket = word[open_pos + 1:close_pos].strip()
        processed_words.extend(
            part for part in (before_bracket, inside_bracket) if part
        )

    return processed_words

# Usage: read the existing index, expand bracketed entries, print the result
# and write every word longer than two characters to the output file.
items = parse_text_file('existingIndex.txt')
# Truncation inside parse_text_file can leave very short fragments; drop them.
items = [entry for entry in items if len(entry) >= 3]
processed_words_list = process_words_list(items)
print(processed_words_list)

output_file_path = 'index_words_list.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for word in processed_words_list:
        # Splitting on brackets can again produce short pieces; skip those.
        if len(word) < 3:
            continue
        output_file.write(word + '\n')