-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpusGeneration.py
87 lines (74 loc) · 3.09 KB
/
corpusGeneration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup
import os
import glob
import re
import traceback
# Module-level list; it is not referenced anywhere in the visible part of this file.
all_files = []
# Name of the output folder that write_to_file creates (if missing) and writes into.
directory = 'Corpus'
# Function to read the articles of the cacm html files and generate a corpus from them
def get_content():
    """Build one cleaned text file per HTML article found in the ``cacm`` folder.

    For every ``cacm/*.html`` file, the text of its first ``<pre>`` element is
    passed through ``text_transformation``, lower-cased and handed to
    ``write_to_file`` under the article's base name (the file name without its
    directory and without the ``.html`` extension).

    Problems are reported per file on stdout (message plus traceback) and do
    not stop the remaining files from being processed. Nothing is returned.
    """
    path = 'cacm'
    for filename in glob.glob(os.path.join(path, '*.html')):
        try:
            # str.strip() removes a *set of characters* from both ends, not a
            # prefix/suffix, so it can eat letters of the real name. Derive
            # the article name from the path components instead.
            article_name = os.path.splitext(os.path.basename(filename))[0]

            # Read the raw bytes and let BeautifulSoup detect the encoding,
            # rather than relying on the platform's default text encoding.
            with open(filename, 'rb') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')

            pre_tag = soup.find('pre')
            if pre_tag is None:
                print('No <pre> element found in ' + filename + ', skipping.')
                continue

            # Keep the text as str: text_transformation applies a str regex
            # pattern, which cannot be matched against encoded bytes.
            processed_text = text_transformation(pre_tag.get_text()).lower()
            write_to_file(processed_text, article_name)
        except Exception:
            # Best effort: report the failing file and carry on with the rest.
            print('Error in try block of get_content for ' + filename + '!',
                  traceback.format_exc())
# Matches an AM/PM marker only when it is a standalone token that is followed
# by more text. 'PMB' is accepted as a variant spelling of 'PM'.
_TIMESTAMP_MARKER = re.compile(r'(?<!\S)(PMB?|AM)(?= )')


# Function to perform text transformation on the string provided to it as argument
def text_transformation(content):
    """Clean *content* and cut off everything after its last AM/PM marker.

    Steps:
      1. Replace whitespace and the listed special characters with a space.
         Apostrophes, hyphens, commas and periods are deliberately not in the
         list, so they survive inside words.
      2. For each word, drop one trailing '-', ',' or '.', then strip leading
         punctuation with ``remove_punctuation``; empty results are dropped.
      3. Join the words with single spaces.
      4. If the text contains a standalone ``AM``, ``PM`` or ``PMB`` token
         followed by more text, keep only what precedes the LAST such token
         and append `` am`` / `` pm``. NOTE(review): this presumably targets
         the record timestamp and the trailing data after it - confirm
         against the corpus files.

    The marker is matched as a whole token rather than as a substring, so an
    upper-case word that merely contains "AM" or "PM" (e.g. "PROGRAM") no
    longer truncates the document.

    :param content: text to clean (str).
    :return: the cleaned text (str); case is not changed except for the
        appended ``am`` / ``pm``.
    """
    content = re.sub(r'[@_!\s^&*?#=+$~%:;\\/|<>(){}[\]"]', ' ', content)

    words = []
    for word in content.split():
        # Only a single trailing punctuation character is removed.
        if word.endswith(('-', ',', '.')):
            word = word[:-1]
        word = remove_punctuation(word)
        if word:
            words.append(word)
    text = " ".join(words)

    # Removing noise from the text
    markers = list(_TIMESTAMP_MARKER.finditer(text))
    if not markers:
        return text
    last_marker = markers[-1]
    meridiem = "am" if last_marker.group(1) == "AM" else "pm"
    # The slice keeps the space that preceded the marker, which matches the
    # output the previous split()-based implementation produced.
    return text[:last_marker.start()] + " " + meridiem
# Function to strip stray punctuation from the front of a word
def remove_punctuation(word):
    """Drop leading '-', ',' and '.' characters from *word*.

    Characters are removed one at a time. Stripping stops as soon as what is
    left looks like a plain number (optional '-' sign, optional digits, an
    optional '.', then at least one digit), so values such as "-5" or ".75"
    keep their sign / decimal point.

    :param word: a single token.
    :return: the token without its leading punctuation.
    """
    leading_noise = ("-", ",", ".")
    while word[:1] in leading_noise:
        if re.match(r'^[\-]?[0-9]*\.?[0-9]+$', word):
            break
        word = word[1:]
    return word
# Function to write the content in file
def write_to_file(content, file_name):
    """Write *content* to ``<directory>/<file_name>.txt``.

    ``directory`` is the module-level output folder; it is created first when
    it does not exist yet. An existing file of the same name is overwritten.

    Any failure is reported on stdout (message plus traceback) instead of
    being raised, so the caller keeps running.

    :param content: text to store in the file.
    :param file_name: name of the target file without the ``.txt`` extension.
    """
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        # The context manager closes the handle even when write() fails.
        with open(directory + '/' + file_name + '.txt', 'w') as file_index_terms:
            file_index_terms.write(content)
    except Exception:
        print("Error in try block of write_to_file!", traceback.format_exc())