-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanup.py
77 lines (64 loc) · 2.56 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
# ---------------------------------------------------------------------------
# Configuration: edit the values below before running the script.
# The active paths are written the way Bash / a terminal expects them.
# ---------------------------------------------------------------------------
# Folder that holds the stripped source .txt files.
sourcePath = "../corpus/stripped/"
# Folder the cleaned files are saved to (placeholder: replace with a real path).
destPath = "/path to file/filename/"
# Equivalent settings when working from the Windows command prompt:
#sourcePath = r"C:/Users/Tiff/Documents/UVic/Classes/ENGL598/samples/stripped/"
#destPath = r"C:/Users/Tiff/Documents/UVic/Classes/ENGL598/samples/oneString/"
# Name of the file currently being processed; filled in by the loop below.
fileName = ""
# Optional: names of specific files (comma-separated strings) to process
# instead of the whole source folder. It only takes effect if the processing
# loop is switched to iterate over this list rather than os.listdir(sourcePath).
listOfFiles = []
# Patterns are compiled once here instead of on every pass through the loop.
# Two or more consecutive capitals that are not followed by a word character,
# e.g. chapter headings IN ALL CAPS. A lookahead is used instead of [^\w] so
# that a heading ending the line (such as "PREFACE") is caught as well.
allCaps = re.compile(r'[A-Z][A-Z](?!\w)')
# Lines with no word characters at all (blank lines, rules of punctuation) or
# with nothing but one capital letter plus punctuation, e.g. the "I." that
# opens a chapter.
chapNum = re.compile(r'^\W*(?:[A-Z]\W*)?$')
# Illustration captions.
illus = re.compile(r'\[Illustration:')
# Markup characters that are deleted outright.
markupChars = re.compile(r'[_=|]')
# Runs of two or more hyphens, which are turned into a single em dash.
hyphenRun = re.compile(r'-{2,}')


def filterLines(lines):
    """Return the lines of *lines* that should be kept, in their original order.

    A line is dropped when any of the following holds:
      * it contains two or more consecutive capital letters that are not
        followed by a word character (ALL-CAPS headings);
      * it has no word characters, or only a single capital letter
        (blank lines, "I." style chapter numbers);
      * it contains "[Illustration:";
      * it contains an asterisk.

    lines -- iterable of strings without line terminators.
    Returns a list of strings (empty if nothing survives).
    """
    return [
        line for line in lines
        if not (
            allCaps.search(line)
            or chapNum.match(line)
            or illus.search(line)
            or '*' in line
        )
    ]


def subChars(text):
    """Return *text* with unwanted characters removed or replaced.

    Underscores, equals signs and pipes are deleted, a left single quotation
    mark becomes a plain apostrophe, and any run of two or more hyphens
    becomes one em dash. Single hyphens are left alone.
    """
    text = markupChars.sub('', text)
    text = text.replace('‘', '\'')
    return hyphenRun.sub('—', text)


def cleanText(fulltext):
    """Clean the full contents of one file and return it as a single string.

    The text is split into lines as they appear in the raw file, unwanted
    lines are filtered out, the survivors are joined with single spaces, and
    unwanted characters are then substituted.
    """
    return subChars(' '.join(filterLines(fulltext.splitlines())))


# Guarded so that importing this file (for example to reuse or test the
# helpers above) does not start processing files; running it as a script
# behaves as before.
if __name__ == "__main__":
    # To process only the files named in listOfFiles, loop over that list
    # instead of the directory listing:
    #for fileName in listOfFiles:
    # Iterate over each entry in the source directory.
    for fileName in os.listdir(sourcePath):
        inPath = os.path.join(sourcePath, fileName)
        # os.listdir also returns sub-directories; open() would fail on them.
        if not os.path.isfile(inPath):
            continue
        # Read-only access is enough, and the with-block closes the handle.
        # NOTE(review): reading and writing use the platform default encoding,
        # as before; pass an explicit encoding if the corpus is known to be
        # UTF-8 -- confirm.
        with open(inPath, 'r') as contents:
            fulltext = contents.read()
        result = cleanText(fulltext)
        # Save the cleaned text under the same file name in destPath.
        with open(os.path.join(destPath, fileName), 'w') as f:
            f.write(result)