Skip to content

Commit 8ed67d3

Browse files
committed
#95 handle exceptions from WikidataIntegrator in data update
1 parent 6a0be83 commit 8ed67d3

File tree

1 file changed

+70
-60
lines changed

1 file changed

+70
-60
lines changed

Data/update_data.py

+70-60
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@
88
import json
99
import os
1010
import sys
11-
from subprocess import call
1211

12+
from requests.exceptions import HTTPError
1313
from tqdm.auto import tqdm
1414
from wikidataintegrator import wdi_core
15+
from wikidataintegrator.wdi_config import config as wdi_config
16+
17+
wdi_config["BACKOFF_MAX_TRIES"] = 1
1518

1619
with open("total_data.json") as f:
1720
current_data = json.load(f)
@@ -82,11 +85,11 @@
8285

8386
possible_queries = []
8487
for d in data_dir_dirs:
85-
for wt in word_types:
86-
if "./" + d + "/" + wt in [
87-
e[: len("./" + d + "/" + wt)] for e in data_dir_elements
88+
for target_type in word_types:
89+
if "./" + d + "/" + target_type in [
90+
e[: len("./" + d + "/" + target_type)] for e in data_dir_elements
8891
]:
89-
possible_queries.append(d + "/" + wt)
92+
possible_queries.append(d + "/" + target_type)
9093

9194
queries_to_run_lists = [
9295
[q for q in possible_queries if q[: len(lang)] in current_languages]
@@ -98,6 +101,7 @@
98101
data_added_dict = {}
99102

100103
for q in tqdm(queries_to_run, desc="Data updated", unit="dirs",):
104+
lang = q.split("/")[0]
101105
target_type = q.split("/")[1]
102106
query_name = "query" + target_type.title() + ".sparql"
103107
query_path = "./" + q + "/" + query_name
@@ -106,58 +110,64 @@
106110
query_lines = file.readlines()
107111

108112
# Join the query lines into a single string and pass it to WikidataIntegrator.
109-
print(f"Querying {q.split('/')[0]} {q.split('/')[1]}")
110-
query = wdi_core.WDFunctionsEngine.execute_sparql_query("".join(query_lines))
111-
112-
query_results = query["results"]["bindings"]
113-
114-
# Format and save the resulting JSON.
115-
results_formatted = []
116-
for r in query_results: # query_results is also a list
117-
r_dict = {k: r[k]["value"] for k in r.keys()}
118-
119-
results_formatted.append(r_dict)
120-
121-
with open(
122-
f"./{q.split('/')[0]}/{q.split('/')[1]}/{q.split('/')[1]}Queried.json",
123-
"w",
124-
encoding="utf-8",
125-
) as f:
126-
json.dump(results_formatted, f, ensure_ascii=False, indent=2)
127-
128-
# Call the corresponding formatting file and update data changes.
129-
call(
130-
["python", f"./{q.split('/')[0]}/{q.split('/')[1]}/format_{q.split('/')[1]}"],
131-
shell=True,
132-
)
133-
134-
with open(
135-
f"./../Keyboards/LanguageKeyboards/{q.split('/')[0]}/{q.split('/')[1]}.json"
136-
) as f:
137-
new_keyboard_data = json.load(f)
138-
139-
data_added_dict[q.split("/")[0]][q.split("/")[1]] = (
140-
len(new_keyboard_data) - current_data[q.split("/")[0]][q.split("/")[1]]
141-
)
142-
143-
current_data[q.split("/")[0]][q.split("/")[1]] = len(new_keyboard_data)
144-
145-
# Update total_data.json
146-
with open("./total_data.json", "w", encoding="utf-8",) as f:
147-
json.dump(current_data, f, ensure_ascii=False, indent=2)
148-
149-
# Update data_updates.txt
150-
data_added_string = """"""
151-
for l in data_added_dict:
152-
data_added_string += f"\n{l}"
153-
for w in word_types:
154-
if data_added_dict[l][w] == 0:
155-
pass
156-
elif data_added_dict[l][w] == 1: # remove the s for label
157-
data_added_string += f"{data_added_dict[l][w]} {w[:-1]},"
158-
else:
159-
data_added_string += f"{data_added_dict[l][w]} {w},"
160-
data_added_string = data_added_string[:-1] # remove the last comma
161-
162-
with open("data_updates.txt", "w+") as f:
163-
f.writelines(data_added_string)
113+
print(f"Querying {lang} {target_type}")
114+
query = None
115+
try:
116+
query = wdi_core.WDFunctionsEngine.execute_sparql_query("".join(query_lines))
117+
except HTTPError as err:
118+
print(f"HTTPError with {query_name}: {err}")
119+
120+
if query is None:
121+
print(f"Nothing returned by the WDQS server for {query_name}")
122+
123+
else:
124+
query_results = query["results"]["bindings"]
125+
126+
# Format and save the resulting JSON.
127+
results_formatted = []
128+
for r in query_results: # query_results is also a list
129+
r_dict = {k: r[k]["value"] for k in r.keys()}
130+
131+
results_formatted.append(r_dict)
132+
133+
with open(
134+
f"./{lang}/{target_type}/{target_type}Queried.json", "w", encoding="utf-8",
135+
) as f:
136+
json.dump(results_formatted, f, ensure_ascii=False, indent=2)
137+
138+
# Call the corresponding formatting file and update data changes.
139+
os.system(f"python ./{lang}/{target_type}/format_{target_type}.py")
140+
141+
with open(
142+
f"./../Keyboards/LanguageKeyboards/{lang}/Data/{target_type}.json"
143+
) as f:
144+
new_keyboard_data = json.load(f)
145+
146+
if lang not in data_added_dict.keys():
147+
data_added_dict[lang] = {}
148+
data_added_dict[lang][target_type] = (
149+
len(new_keyboard_data) - current_data[lang][target_type]
150+
)
151+
152+
current_data[lang][target_type] = len(new_keyboard_data)
153+
154+
# Update total_data.json.
155+
with open("./total_data.json", "w", encoding="utf-8",) as f:
156+
json.dump(current_data, f, ensure_ascii=False, indent=2)
157+
158+
# Update data_updates.txt.
159+
data_added_string = ""
160+
language_keys = list(data_added_dict.keys())
161+
for l in language_keys:
162+
data_added_string += f"- {l} " if l == language_keys[0] else f"\n- {l} "
163+
for w in word_types:
164+
if data_added_dict[l][w] == 0:
165+
pass
166+
elif data_added_dict[l][w] == 1: # remove the s for label
167+
data_added_string += f"{data_added_dict[l][w]} {w[:-1]},"
168+
else:
169+
data_added_string += f"{data_added_dict[l][w]} {w},"
170+
data_added_string = data_added_string[:-1] # remove the last comma
171+
172+
with open("data_updates.txt", "w+") as f:
173+
f.writelines(data_added_string)

0 commit comments

Comments
 (0)