# 2.3 textProcessing.py
import os
import pickle
import csv
import re
import requests
import spacy
from bs4 import BeautifulSoup
from gensim.models import Phrases
from tqdm import tqdm
from wordcloud import WordCloud
nlp = spacy.load("en_core_web_lg")
nlp.max_length = 1500000  # Raise spaCy's max document length so long minutes don't trip the limit
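# Possible speed-up (untested sketch): only POS tags and lemmas are used below, so the
# parser and NER components could be disabled when loading the model:
#   nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])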
minutes = pickle.load(open(os.path.join(os.path.dirname(__file__), "data", "1fomcLinks"), "rb"))
# minutes.reverse() # DEBUG starting from 1993
minutesNew = []
for row in tqdm(minutes):
    # if row["link"] != "https://www.federalreserve.gov/monetarypolicy/fomcminutes20091104.htm":
    #     continue
    soup = BeautifulSoup(requests.get(row["link"]).content, "html.parser")
    body = str(soup)
    content = soup.find("div", id="content")
    text = content if content is not None else soup.find("body")
    # Break the document into paragraphs with a minimum length of 300 characters.
    # text.split("\n") doesn't work: minutes before 2000 use \n for every line break, even within paragraphs!
    # Straightforward solution: take the text content of <p> tags.
    # Major problem: using HTML closing tags apparently ran counter to Greenspan-era policy, so the rule
    # above yields no text in many instances from around 1997 to 2003.
    # Solution: check whether there are stacked closing tags and apply a different splitting method if so.
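    # Illustration (hedged, simplified) of the two markup styles this distinguishes:
    #   pre-~2000:  <p>First paragraph<p>Second paragraph          (no closing tags)
    #   later:      <p>First paragraph</p><p>Second paragraph</p>  (well-formed)
    # In the first case bs4 nests the unclosed tags and closes them all at once,
    # which is what the "</p></p></p>" check below detects.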
    paragraphstmp = []
    paragraphs = []
    if body.count("</p></p></p>"):  # bs4 adds closing tags
        paragraphstmp = body.split("<p>")  # Split the document on opening <p> tags
        paragraphstmp.pop(0)  # Remove the section before the first <p> tag
        for para in paragraphstmp:
            par = re.sub(r"<[^>]*>", "", para)  # Strip any remaining tags
            par = re.sub(r"(\r\n|\n|\r|\t)", " ", par)  # Replace line breaks and tabs with spaces
            par = re.sub("\xa0", " ", par)  # Replace non-breaking spaces with regular spaces
            par = par.strip()  # strip() returns a new string, so the result must be reassigned
            if len(par) > 300:
                paragraphs.append(par)
    else:
        paragraphstmp = text.find_all("p")
        for para in paragraphstmp:
            # Ensure there is no nested paragraph and enforce the minimum length
            if not len(para.find_all("p")) and len(para.text) > 300:
                paragraphs.append(para.text)
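    # Illustration (hedged): the tag-stripping regex above removes anything between < and >, e.g.
    #   re.sub(r"<[^>]*>", "", "<p>The <b>Committee</b> met")  # -> "The Committee met"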
    DLfilteredparagraphs = []
    DLrawparagraphs = []
    for para in tqdm(paragraphs, leave=False):
        # Run the standard spaCy pipeline (tokenization, POS tagging, lemmatization, etc.)
        paraClassified = nlp(para)
        filteredTokens = []
        rawTokens = []
        for token in paraClassified:
            if not token.is_stop and not token.is_punct and token.pos_ in ("NOUN", "ADJ", "VERB"):
                filteredTokens.append(token.lemma_.lower())
            rawTokens.append(token.lower_)
        DLfilteredparagraphs.append(filteredTokens)
        DLrawparagraphs.append(rawTokens)  # Only tokenized and decapitalised, no POS filtering
    minutesNew.append({**row, "filteredParagraphs": DLfilteredparagraphs, "rawParagraphs": DLrawparagraphs})
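# Illustration (hedged; exact output depends on the model version): the filter above maps
#   [t.lemma_.lower() for t in nlp("The Committee raised interest rates.")
#    if not t.is_stop and not t.is_punct and t.pos_ in ("NOUN", "ADJ", "VERB")]
# to roughly ["committee", "raise", "interest", "rate"].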
# Add dissent variable
minutesNewnew = []
for row in minutesNew:
    soup = BeautifulSoup(requests.get(row["link"]).content, "html.parser")
    content = soup.find("div", id="content")
    text = content if content is not None else soup.find("body")
    phrase = "against this action:"
    index = text.text.find(phrase)
    assert index >= 0, ("'against this action:' not found", row["link"])  # find() returns -1 on failure
    index += len(phrase)
    # "Votes against this action: None" marks a unanimous vote; anything else names dissenters
    dissent = 0 if "None" in text.text[index:index + 10] else 1
    minutesNewnew.append({**row, "dissent": dissent})
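# Illustration (hedged, grounded in the search phrase above): unanimous minutes read
# "Votes against this action: None", while dissents list names instead, e.g.
# "Votes against this action: Mr. Hoenig". A 10-character window after the phrase
# should suffice to distinguish "None" from a name.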
# TODO Implement bigrams and trigrams!
# Add bigrams to the filteredParagraphs list; spaCy has a related feature called noun chunks
if False:  # Disabled sketch, kept for reference
    bigram = Phrases(filteredParagraphs, min_count=20)
    for i in range(len(filteredParagraphs)):
        # gensim applies phrase detection when the Phrases object is indexed with an entire
        # lemmatized paragraph; detected pairs come back merged into single "_"-joined tokens
        for token in bigram[filteredParagraphs[i]]:
            if "_" in token:
                filteredParagraphs[i].append(token)  # Append detected bigrams to the original paragraph
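# Illustration (hedged, assuming gensim 4.x): Phrases merges frequent pairs into "_"-joined tokens.
#   bigram = Phrases([["interest", "rate"]] * 40, min_count=1, threshold=1)
#   bigram[["interest", "rate", "decision"]]  # -> ["interest_rate", "decision"]
# The loop above appends such merged tokens to the paragraph so the unigrams are kept as well.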
# Preserve filtered paragraphs for further use
# pickle.dump(rawParagraphs, open("data/2rawParagraphs", "wb"))  # Deprecated
# pickle.dump(filteredParagraphs, open(os.path.join(os.path.dirname(__file__), "data", "2filteredParagraphs"), "wb"))  # Deprecated
pickle.dump(minutesNewnew, open(os.path.join(os.path.dirname(__file__), "data", "2DLparagraphs"), "wb"))  # Doesn't contain bigrams
# Done for now
# TODO optional: fine-tune the bigram parameters
# TODO optional: cut off the administrative section at the first occurrence of the word "vote"
with open(os.path.join(os.path.dirname(__file__), "data", "processedText.csv"), "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, minutesNewnew[0].keys())
    writer.writeheader()
    writer.writerows(minutesNewnew)