import copy
import math
import os
import os.path
import random
import re
import ssl
import stat
import string
import subprocess
import sys
import time
import warnings
from collections import OrderedDict
from pathlib import Path
from urllib.error import HTTPError

import cv2
import geopandas as gpd
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import prince
import pytesseract
import requests
import spacy
import wikipedia
from bs4 import BeautifulSoup
from country_list import countries_for_language
from nltk.chunk import tree2conlltags
from nltk.sem import extract_rels, rtuple
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from scipy import interpolate
from sklearn.decomposition import PCA
from sklearn.preprocessing import (MinMaxScaler, Normalizer, PowerTransformer,
                                   QuantileTransformer, RobustScaler, StandardScaler)
from spacy.symbols import ADP, PROPN, VERB, pobj, prep
from spacy.util import compounding, minibatch
from SPARQLWrapper import JSON, SPARQLWrapper
from wikipedia import DisambiguationError, PageError

# Silence noisy warnings and allow unverified HTTPS (needed for some of the scraped sites)
ssl._create_default_https_context = ssl._create_unverified_context
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None
#General wrapper functions to prevent the actual function throwing an error
def math_wrapper_is_nan(x):
    """
    Function that checks whether x is nan without throwing an error
    """
    try:
        return math.isnan(x)
    except TypeError:
        return False
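# For example, math_wrapper_is_nan(float('nan')) returns True, while math_wrapper_is_nan('text')
# returns False rather than raising a TypeError.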
def soup_find_wrapper(x):
    """
    Function that extracts the name span's text from a Soup object without throwing an error if the Soup object is None
    """
    try:
        return x.find("span", itemprop="name").text
    except AttributeError:
        return np.nan
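# For example, soup_find_wrapper(BeautifulSoup('<span itemprop="name">London</span>', 'html.parser'))
# returns 'London', while soup_find_wrapper(None) returns nan (markup shown for illustration).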
def extract_wikidata_identifier(wikidata_link):
    """
    Function that returns the Wikidata identifier from the URL or URI
    """
    if math_wrapper_is_nan(wikidata_link) or wikidata_link is None:
        return np.nan
    return wikidata_link.split('/')[-1]
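# For example, extract_wikidata_identifier('http://www.wikidata.org/entity/Q42') returns 'Q42'
# (URI shown for illustration).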
#Read in dataframes that are needed
index_of_places = pd.read_csv("IPN_GB_2021.csv", encoding = "ISO-8859-1")
countries = dict(countries_for_language('en'))
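# index_of_places is presumably the ONS Index of Place Names in Great Britain (2021 edition);
# countries maps ISO country codes to English names, e.g. countries['GB'] == 'United Kingdom'.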
"""
To make the implementation clear, I have sectioned the code into the following sections
In each section I usually define any relevent functions and follow this with the code that calls these functions
It does take quite a while to finish
SECTION 1: DATA COLLECTION
Subtask 1 : Collect name and wikidata identifier on all MPs
Tasks:
1.
i) Extract unique identifiers, parlimentary number, and optionally place of birth from Wikidata of MPs from first UK parliament to current UK Parliament
2
i) Extract names and unique identifiers from Wikipedia of MPs from first UK parliament to current UK parliament
3
i) Join these two datasets together
Subtask 2 : Collect place of birth data
Tasks:
1.
i) Collect place of birth data from Wikipedia, Wikitree, Geni and Rush Parliamentary Archives
2.
i) Process and standarise this data
3.
i) Take the mode of this data to find historic county
Subtask 3 : Collection population data
Tasks:
1.
i) Extract British population data from Vision of Britian
ii) Combine this data into historical county
2.
i) Perform OCR and process Northern Ireland population data
SECTION 2: Graph and Map Creation
Subtask 1 : Create choropleth map
Subtask 2 : Create PCA
Subtask 3 : Create MFA
Subtask 4 : Create MCA
"""
##########################################################################
# SECTION 1 SUBTASK 1 TASK 1)i - Query Wikidata #
##########################################################################
#First SPARQL query, which gathers all MPs from every parliament and, optionally, their gender, date of birth and place of birth
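# Wikidata vocabulary used below (per the Wikidata property pages): wdt:P39 = position held,
# wdt:P279 = subclass of, wdt:P21 = sex or gender, wdt:P569 = date of birth, wdt:P19 = place of birth.
# wd:Q16707842 appears to be the class whose subclasses are the individual
# "Member of the Nth Parliament of the United Kingdom" positions.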
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
SELECT ?person ?PN ?gender ?dateOfBirth ?personLabel ?PNLabel ?genderLabel ?dateOfBirthLabel ?placeOfBirth ?placeOfBirthLabel
WHERE
{
?person wdt:P39 ?PN .
?PN wdt:P279 wd:Q16707842 .
OPTIONAL {?person wdt:P21 ?gender . }
OPTIONAL {?person wdt:P569 ?dateOfBirth . }
OPTIONAL {?person wdt:P19 ?placeOfBirth . }
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
sparql.setReturnFormat(JSON)
wikidata_query_result = sparql.query().convert()
first_wikidata_query = pd.json_normalize(wikidata_query_result['results']['bindings'])
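# Each SPARQL binding arrives as {variable: {'type': ..., 'value': ...}}, so pd.json_normalize
# flattens it into dotted columns such as 'person.value', which the code below relies on.
# In the batched query inside return_party_electory_startdate, p:P39/ps:P39 walk the
# "position held" statement node so that its qualifiers can be read: pq:P580 = start time,
# pq:P768 = electoral district, pq:P4100 = parliamentary group (per the Wikidata property pages).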
def return_party_electory_startdate(column):
    """
    Function that returns the electoral district, the political party and the start time for an MP
    for each specific parliament they served in.
    These queries had to be split up as they exceeded the quota set by the SPARQL endpoint.
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    #Create the final dataframe the results will be appended to
    columns = ['person.value', 'group.value', 'elect.value', 'PNN.value', 'starttime.value']
    final_results = pd.DataFrame(columns=columns)
    nrows = 100
    start = 0
    end = nrows
    reached_end = False
    #Submit MP Wikidata identifiers in batches of length nrows
    for i in range(math.ceil((len(column) + 1) / nrows)):
        #Avoid a "too many requests" error
        time.sleep(3)
        values = '{ '
        for j in range(start, end):
            #Check if the end of the column has been reached
            if j >= len(column):
                reached_end = True
                break
            wikidata_identifier = column[j]
            #Append the wikidata identifiers to the VALUES clause
            values += ' wd:' + wikidata_identifier + ' '
        values += " }"
        done = False
        while not done:
            try:
                #Submit the query with this batch of values
                sparql.setQuery("""SELECT ?person ?PNN ?starttime ?elect ?group ?electLabel ?groupLabel WHERE {
                    VALUES ?person """ + values + """
                    ?person p:P39 ?statement .
                    ?statement ps:P39 ?PNN .
                    OPTIONAL{ ?statement pq:P580 ?starttime .}
                    OPTIONAL{ ?statement pq:P768 ?elect . }
                    OPTIONAL{ ?statement pq:P4100 ?group . }
                    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                    } """)
                sparql.setReturnFormat(JSON)
                results = sparql.query().convert()
                results_df = pd.json_normalize(results['results']['bindings'])
                done = True
            except HTTPError:
                #If we hit a "too many requests" error, wait and try again
                time.sleep(3)
        #A column is absent from the response when the value was missing for every row in the batch
        if 'group.value' not in list(results_df.columns):
            results_df['group.value'] = np.nan
        if 'elect.value' not in list(results_df.columns):
            results_df['elect.value'] = np.nan
        try:
            final_results = pd.concat([final_results, results_df[['person.value', 'group.value', 'elect.value', 'PNN.value', 'starttime.value']]])
        except KeyError:
            pass
        start += nrows
        end += nrows
        if reached_end:
            break
    final_results['person.value'] = final_results['person.value'].apply(extract_wikidata_identifier)
    final_results.set_index(final_results['person.value'], inplace=True)
    final_results = final_results.drop(columns=['person.value'])
    return final_results
def retrive_parliament_number(label):
    """
    Function that extracts the parliament number from a position label such as
    'Member of the 57th Parliament of the United Kingdom'
    (the parameter is named label so it does not shadow the imported string module)
    """
    label = label.replace('Member of the ', '')
    label = label.replace('Parliament of the United Kingdom', '')
    #Strip the ordinal suffix ('th ', 'nd', 'st', 'rd') so that only the digits remain
    label = label.replace('th ', '')
    label = label.replace('nd', '')
    label = label.replace('st', '')
    label = label.replace('rd', '')
    return int(label)
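# For example, retrive_parliament_number('Member of the 57th Parliament of the United Kingdom')
# returns 57 (label shown for illustration).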
#Process the first wikidata query to extract the unique Wikidata identifier from the wikidata URI returned
first_wikidata_query['ID'] = first_wikidata_query['person.value'].apply(extract_wikidata_identifier)
first_wikidata_query['parliamentNumber'] = first_wikidata_query['PNLabel.value'].apply(retrive_parliament_number)
#Make a second Wikidata query that returns the parliamentary group, electoral district and start date for each parliament an MP served in
second_wikidata_query = return_party_electory_startdate(first_wikidata_query['ID'])
#Process the two query datasets so they can be joined
unique_PN = list(first_wikidata_query['PN.value'].unique())
second_wikidata_query = second_wikidata_query[second_wikidata_query['PNN.value'].isin(unique_PN)]
second_wikidata_query.drop_duplicates(inplace=True)
second_wikidata_query = second_wikidata_query.rename(columns={'PNN.value': 'PN.value'})
first_wikidata_query['person.value'] = first_wikidata_query['person.value'].apply(extract_wikidata_identifier)
#Perform left join
wikidataQuery = pd.merge(first_wikidata_query, second_wikidata_query, how='left', left_on=['person.value','PN.value'], right_on = ['person.value','PN.value'])
#Save the dataframe and reload it so later steps work from the cached CSV
wikidataQuery.to_csv('wikidataQuery.csv')
wikidataQuery = pd.read_csv('wikidataQuery.csv')
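# The left join keeps one row per (MP, parliament) pair from the first query and attaches the
# party, electoral district and start date found by the second query where they are available.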