-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#95 Baseline data update scripts that can print results of a query
- Loading branch information
1 parent
f2d976b
commit 5f63856
Showing
5 changed files
with
105 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{ | ||
"French": { | ||
"nouns": 15710, | ||
"verbs": 1241 | ||
}, | ||
"German": { | ||
"nouns": 27526, | ||
"verbs": 3125, | ||
"prepositions": 188 | ||
}, | ||
"Portuguese": { | ||
"nouns": 4530, | ||
"verbs": 188 | ||
}, | ||
"Russian": { | ||
"nouns": 194389, | ||
"verbs": 11, | ||
"prepositions": 12 | ||
}, | ||
"Spanish": { | ||
"nouns": 8310, | ||
"verbs": 87 | ||
}, | ||
"Swedish": { | ||
"nouns": 41102, | ||
"verbs": 4133 | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
""" | ||
Update Data | ||
----------- | ||
Updates all data for Scribe by running all WDQS queries and formatting scripts. | ||
""" | ||
|
||
|
||
import json | ||
import os | ||
|
||
from tqdm.auto import tqdm | ||
from wikidataintegrator import wdi_core | ||
|
||
# Top-level directories of the data folder, in a stable (sorted) order.
# Using os.path.isdir avoids parsing "./"-prefixed path strings by hand, which
# is fragile when the platform separator is not "/" or a name contains "./".
data_dir_dirs = sorted(
    entry for entry in os.listdir(".") if os.path.isdir(os.path.join(".", entry))
)

word_types = ["nouns", "verbs", "prepositions"]

# total_data.json maps each language name to its per-word-type totals; only
# the keys (language names) are used below.
with open("total_data.json", encoding="utf-8") as f:
    current_data = json.load(f)
    current_languages = list(current_data.keys())

# Every "<directory>/<word_type>" pair that actually exists on disk.
possible_queries = [
    d + "/" + wt
    for d in data_dir_dirs
    for wt in word_types
    if os.path.isdir(os.path.join(".", d, wt))
]

# Keep only queries whose directory name is exactly a known language. An exact
# match (not a prefix match) stops e.g. "Germanic" from being taken for
# "German". Order follows possible_queries, so it is deterministic.
known_languages = set(current_languages)
queries_to_run = [q for q in possible_queries if q.split("/")[0] in known_languages]

# NOTE(review): only the first query is executed ([:1]). This looks like a
# deliberate limit while the script merely prints results - confirm before
# removing it to run every query.
for q in tqdm(queries_to_run[:1], desc="Data updated", unit="dirs"):
    target_type = q.split("/")[1]
    query_name = "query" + target_type.title() + ".sparql"
    query_path = "./" + q + "/" + query_name

    # Read the whole query as one string, which is then passed to
    # wikidataintegrator.
    with open(query_path, encoding="utf-8") as file:
        query = file.read()

    query_results = wdi_core.WDFunctionsEngine.execute_sparql_query(query)

    print(query_results)
    print(query_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
- French (New): 15,710 nouns, 1,241 verbs (mostly infinitives), 67,609 translations | ||
- German: 401 nouns, 78 verbs, corrected many prepositions | ||
- Portuguese (New): 4,530 nouns, 188 verbs, 67,609 translations | ||
- Russian (New): 194,389 nouns, 11 verbs, 12 prepositions, 67,609 translations | ||
- Spanish: 180 nouns, 22 verbs | ||
- Swedish (New): 41,102 nouns, 4,133 verbs, 67,609 translations |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,5 @@ dependencies: | |
- black>=19.10b0 | ||
- transformers>=4.12 | ||
- sentencepiece>=0.1.95 | ||
- pip: | ||
- wikidataintegrator>=0.9.22 |