From 5f63856b07cf0334331ee03d808fddd32db70d00 Mon Sep 17 00:00:00 2001
From: Andrew Tavis
Date: Tue, 11 Jan 2022 14:04:30 +0100
Subject: [PATCH] #95 Baseline data update scripts that can print results of a query

---
Review note: Data/update_data.py was revised in this patch after the original
commit, so the blob id on its index line below is stale.

 Data/total_data.json  | 28 +++++++++++++++++++++++++
 Data/update_data.py   | 49 +++++++++++++++++++++++++++++++++++++++++++
 Data/updated_data.txt |  6 ++++++
 README.md             |  4 +++-
 environment.yml       |  2 ++
 5 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 Data/total_data.json
 create mode 100644 Data/update_data.py
 create mode 100644 Data/updated_data.txt

diff --git a/Data/total_data.json b/Data/total_data.json
new file mode 100644
index 00000000..b5776315
--- /dev/null
+++ b/Data/total_data.json
@@ -0,0 +1,28 @@
+{
+    "French": {
+        "nouns": 15710,
+        "verbs": 1241
+    },
+    "German": {
+        "nouns": 27526,
+        "verbs": 3125,
+        "prepositions": 188
+    },
+    "Portuguese": {
+        "nouns": 4530,
+        "verbs": 188
+    },
+    "Russian": {
+        "nouns": 194389,
+        "verbs": 11,
+        "prepositions": 12
+    },
+    "Spanish": {
+        "nouns": 8310,
+        "verbs": 87
+    },
+    "Swedish": {
+        "nouns": 41102,
+        "verbs": 4133
+    }
+}
diff --git a/Data/update_data.py b/Data/update_data.py
new file mode 100644
index 00000000..94b99936
--- /dev/null
+++ b/Data/update_data.py
@@ -0,0 +1,49 @@
+"""
+Update Data
+-----------
+
+Updates all data for Scribe by running all WDQS queries and formatting scripts.
+
+All paths are relative to the current working directory, so the script needs to
+be run from within the Data directory.
+"""
+
+
+import json
+import os
+
+from tqdm.auto import tqdm
+from wikidataintegrator import wdi_core
+
+word_types = ["nouns", "verbs", "prepositions"]
+
+# Only languages that already have totals in total_data.json are updated.
+with open("total_data.json", encoding="utf-8") as f:
+    current_data = json.load(f)
+current_languages = sorted(current_data.keys())
+
+# A query can be run for a language and word type if its query file exists.
+# Whole path components are checked so that a language name can't match a
+# directory that merely starts with it, and sorting keeps the order stable.
+queries_to_run = []
+for lang in current_languages:
+    for wt in word_types:
+        path = os.path.join(".", lang, wt, "query" + wt.title() + ".sparql")
+        if os.path.isfile(path):
+            queries_to_run.append(path)
+
+query_path = None
+query_results = None
+
+# NOTE: baseline for #95 - only the first query is run and printed for now.
+for query_path in tqdm(queries_to_run[:1], desc="Data updated", unit="dirs"):
+    with open(query_path, encoding="utf-8") as file:
+        query = file.read()
+
+    query_results = wdi_core.WDFunctionsEngine.execute_sparql_query(query)
+
+if query_path is None:
+    print("No queries found to run.")
+else:
+    print(query_results)
+    print(query_path)
diff --git a/Data/updated_data.txt b/Data/updated_data.txt
new file mode 100644
index 00000000..fa3e0768
--- /dev/null
+++ b/Data/updated_data.txt
@@ -0,0 +1,6 @@
+- French (New): 15,710 nouns, 1,241 verbs (mostly infinitives), 67,609 translations
+- German: 401 nouns, 78 verbs, corrected many prepositions
+- Portuguese (New): 4,530 nouns, 188 verbs, 67,609 translations
+- Russian (New): 194,389 nouns, 11 verbs, 12 prepositions, 67,609 translations
+- Spanish: 180 nouns, 22 verbs
+- Swedish (New): 41,102 nouns, 4,133 verbs, 67,609 translations
diff --git a/README.md b/README.md
index 19569f19..8023e9da 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,12 @@ The Scribe team has interest in creating keyboards for all languages of interest
 
 `*` Given the current **`beta`** status where words are machine translated.
 
-`†` Adjective-preposition support is in progress [(see issue)](https://github.com/scribe-org/Scribe-iOS/issues/86).
+`†` Adjective-preposition support is in progress [(see #86)](https://github.com/scribe-org/Scribe-iOS/issues/86).
 
 `‡` Only for languages for which preposition annotation is needed.
 
+WIP [#95](https://github.com/scribe-org/Scribe-iOS/issues/95): Updates to the above data can be done using [Data/update_data.py](https://github.com/scribe-org/Scribe-iOS/tree/main/Data/update_data.py).
+
 # Setup [`⇧`](#contents)
 
 Users access Scribe language keyboards through the following:
diff --git a/environment.yml b/environment.yml
index 13b2d4bb..69c2580e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,3 +6,5 @@ dependencies:
   - black>=19.10b0
   - transformers>=4.12
   - sentencepiece>=0.1.95
+  - pip:
+    - wikidataintegrator>=0.9.22