-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessing_index_of_places.py
241 lines (151 loc) · 8.42 KB
/
processing_index_of_places.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
def return_historic_county(place, place_broken_up, col = 'place20nm', df = index_of_places):
"""
Function that returns the historical county and British country (Wales, England, Scotland) of the place passed in
by quering the Index of Places database for the place passed in
"""
#Find place in the index of places
filtered_df = df.loc[df[col] == place]
#If more than one exact match, then use the other columns and the rest of place to see which match the best
if len(filtered_df) > 1:
if len(place_broken_up) > 2:
place_broken_up.remove(place)
if place in place_broken_up:
print(True)
for place in place_broken_up:
try:
if place != 'United Kingdom' and place != 'UK':
filtered_df_ = filtered_df[filtered_df.apply(lambda r: r.str.contains(place, case=False, na = None).any(), axis=1)]
if len(filtered_df_) != 0:
filtered_df = filtered_df_
except:
return None, None
#If still more, take the one with most information that describes the largest place
if len(filtered_df) > 1:
filtered_df_ = filtered_df[filtered_df['ctyhistnm'].notna()]
if filtered_df_.empty:
try:
filtered_df = filtered_df[filtered_df['descnm'] == 'CTYHIST']
return filtered_df['place20nm'].iloc[0], filtered_df['ctry20nm'].iloc[0]
except IndexError:
pass
df_nan = filtered_df.isnull().sum(axis=1)
minimum = df_nan.min()
index = df_nan.loc[df_nan == minimum].index
filtered_df = filtered_df.loc[index]
if len(filtered_df) > 1:
lst_to_sort_by = ['BUA', 'BUASD', 'CA', 'CED', 'COM', 'CTY', 'CTYHIST', 'CTYLT', 'LOC', 'LONB', 'MD', 'NMD', 'NPARK', 'PAR', 'RGN', 'UA', 'WD']
filtered_df.descnm = filtered_df.descnm.astype("category")
filtered_df.descnm.cat.set_categories(lst_to_sort_by, inplace=True)
filtered_df = filtered_df.iloc[0]
if len(filtered_df) == 0:
return None, None
try:
return filtered_df.iloc[0]['ctyhistnm'], filtered_df.iloc[0]['ctry20nm']
except IndexError:
return filtered_df['ctyhistnm'], filtered_df['ctry20nm']
def search_col(place, col = 'place20nm', df = index_of_places):
"""
Function that checks whether the place passed in, is in the index of places
"""
if df[col].astype(str).str.contains(place).any():
return True
else:
return False
def process_places(place):
"""
Function that returns the tuple:
historic county - if applicable else None
the UK country if applicable (i.e. England, Scotland, Wales, Northern Ireland) else None
the country (e.g. Italy)
"""
#Returns Index in index of places if applicable (i.e. in Great Britain) else None
# the UK country if applicable (i.e. England, Scotland, Wales, Northern Ireland) else None
# the country (e.g. Italy)
if place == None or math_wrapper_is_nan(place):
return None, None, None
#Process the place
place_broken_up = place.split(',')
for i in range(len(place_broken_up)):
place_broken_up[i] = place_broken_up[i].strip()
countries = dict(countries_for_language('en'))
list_of_countries = list(countries.values())
list_of_countries.remove('United Kingdom')
for place in place_broken_up:
#Check if international country (so not included in index of places)
if place in list_of_countries:
return None, None, place
for count in list_of_countries:
if count in place:
return None, None, count
#Check if Ireland (so not included in index of places)
if place == 'Ireland':
return find_ireland(" ".join(place_broken_up))
if len(place_broken_up) == 1:
if place_broken_up[0] == 'England' or place_broken_up[0] == 'Scotland' or place_broken_up[0] == 'Wales':
return None, place_broken_up[0], 'UK'
elif place_broken_up[0] == 'UK' or place_broken_up[0] == 'United Kingdom':
return None, None, 'UK'
else:
print('len is 1 and not identified', place_broken_up)
if 'England' in place_broken_up[0]:
return None, 'England', 'UK'
elif 'Scotland' in place_broken_up[0]:
return None, 'Scotland', 'UK'
elif 'Wales' in place_broken_up[0]:
return None, 'Wales', 'UK'
else:
place_to_look_for = None
for place in place_broken_up:
#Search the place20nm column to find if in index of places
if search_col(place):
index = place_broken_up.index(place)
place_broken_up = place_broken_up[index:]
place_to_look_for = place
break
#If not found sometimes the place will be in Ireland and unlabelled as Ireland so check for this
if place_to_look_for == None:
place_to_look_for = find_ireland(" ".join(place_broken_up))
if place_to_look_for == None:
return None, None, None
else:
return place_to_look_for
else:
historic, country = return_historic_county(place_to_look_for, place_broken_up)
return historic, country, 'UK'
def apply_function_in_sections(df, f, col_to_apply_to, string, section_size = 200):
"""
Function that applies a function f in sections to a column in a dataframe by breaking the column into sections
"""
start = 0
end = section_size
columns = list(df.columns)
print(len(df), 'df')
county_str, country_UK_str, country_str = 'county_' + string, 'country_UK_' + string, 'country_' + string
columns.append(county_str)
columns.append(country_UK_str)
columns.append(country_str)
end_df = pd.DataFrame()
df.reset_index(drop=True, inplace=True)
for i in range(math.ceil((len(df)+1)/section_size)):
section_of_df = df[start:end]
section_of_df['temp'] = section_of_df[col_to_apply_to].apply(extract_wikidata_identifier)
temp_df = section_of_df['temp'].apply(pd.Series)
temp_df.columns = ['a', 'b', 'c']
section_of_df['county_' + string], section_of_df['country_UK_' + string], section_of_df['country_' + string] = temp_df['a'], temp_df['b'], temp_df['c']
section_of_df.drop(columns = ['temp'], inplace = True)
start += section_size
end += section_size
section_of_df = section_of_df.reset_index(drop = True)
end_df = end_df.reset_index(drop = True)
end_df = end_df.append(section_of_df)
return end_df
#combined_df_unique.drop(columns = ['birthPlace.value'], inplace=True)
#combined_df_unique = combined_df_unique.rename(columns = {'attribute.value':'birthPlace.value'})
combined_df_unique.drop_duplicates(subset = ['ID'], inplace = True)
combined_df_unique.set_index(combined_df_unique['ID'], inplace= True)
combined_df_unique = apply_function_in_sections(combined_df_unique, process_places, 'wikitree_place_of_birth', 'wikitree')
combined_df_unique = apply_function_in_sections(combined_df_unique, process_places, 'geni_place_of_birth', 'geni')
combined_df_unique = apply_function_in_sections(combined_df_unique, process_places, 'rush_place_of_birth', 'rush')
combined_df_unique = apply_function_in_sections(combined_df_unique, process_places, 'wikipedia_place_of_birth', 'wikipedia')
combined_df_unique = apply_function_in_sections(combined_df_unique, process_places, 'wikidata_place_of_birth', 'wikidata')
combined_df_unique.to_csv('processed_places_combined_df.csv')