diff --git a/Data/French/nouns/format_nouns.py b/Data/French/nouns/format_nouns.py index e0e8c545..e9902dda 100644 --- a/Data/French/nouns/format_nouns.py +++ b/Data/French/nouns/format_nouns.py @@ -16,12 +16,12 @@ def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions. """ - if wikidata_gender == "masculine": + if wikidata_gender in ["masculine", "Q499327"]: return "M" - elif wikidata_gender == "feminine": + if wikidata_gender in ["feminine", "Q1775415"]: return "F" else: - return "" # necessary as French has words that are incorrectly marked common + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/German/nouns/format_nouns.py b/Data/German/nouns/format_nouns.py index 728efa88..5f3bbbcc 100644 --- a/Data/German/nouns/format_nouns.py +++ b/Data/German/nouns/format_nouns.py @@ -21,12 +21,14 @@ def map_genders(wikidata_gender): wikidata_gender : str The gender of the noun that was queried from WikiData """ - if wikidata_gender == "masculine": + if wikidata_gender in ["masculine", "Q499327"]: return "M" - if wikidata_gender == "feminine": + if wikidata_gender in ["feminine", "Q1775415"]: return "F" - if wikidata_gender == "neuter": + if wikidata_gender in ["neuter", "Q1775461"]: return "N" + else: + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/German/prepositions/format_prepositions.py b/Data/German/prepositions/format_prepositions.py index dba1237a..78a6ffa8 100644 --- a/Data/German/prepositions/format_prepositions.py +++ b/Data/German/prepositions/format_prepositions.py @@ -17,12 +17,14 @@ def convert_cases(case): Converts cases as found on Wikidata to more succinct versions. """ case = case.split(" case")[0] - if case == "accusative": + if case in ["accusative", "Q146078"]: return "Akk" - if case == "dative": + elif case in ["dative", "Q145599"]: return "Dat" - if case == "genitive": + elif case in ["genitive", "Q146233"]: return "Gen" + else: + return "" def order_annotations(annotation): diff --git a/Data/Portuguese/nouns/format_nouns.py b/Data/Portuguese/nouns/format_nouns.py index 777ab314..73be9a27 100644 --- a/Data/Portuguese/nouns/format_nouns.py +++ b/Data/Portuguese/nouns/format_nouns.py @@ -16,12 +16,12 @@ def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions. """ - if wikidata_gender == "masculine": + if wikidata_gender in ["masculine", "Q499327"]: return "M" - elif wikidata_gender == "feminine": + if wikidata_gender in ["feminine", "Q1775415"]: return "F" else: - return "" + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/Russian/nouns/format_nouns.py b/Data/Russian/nouns/format_nouns.py index f1581c37..5908d117 100644 --- a/Data/Russian/nouns/format_nouns.py +++ b/Data/Russian/nouns/format_nouns.py @@ -21,14 +21,14 @@ def map_genders(wikidata_gender): wikidata_gender : str The gender of the noun that was queried from WikiData """ - if wikidata_gender == "masculine": + if wikidata_gender in ["masculine", "Q499327"]: return "M" - elif wikidata_gender == "feminine": + if wikidata_gender in ["feminine", "Q1775415"]: return "F" - elif wikidata_gender == "neuter": + if wikidata_gender in ["neuter", "Q1775461"]: return "N" else: - return "" + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/Russian/prepositions/format_prepositions.py b/Data/Russian/prepositions/format_prepositions.py index 5dcec5c1..57aa2c8e 100644 --- a/Data/Russian/prepositions/format_prepositions.py +++ b/Data/Russian/prepositions/format_prepositions.py @@ -17,20 +17,18 @@ def convert_cases(case): Converts cases as found on Wikidata to more succinct versions. """ case = case.split(" case")[0] - if case == "accusative": + if case in ["accusative", "Q146078"]: return "Akk" - elif case == "dative": + elif case in ["dative", "Q145599"]: return "Dat" - elif case == "genitive": + elif case in ["genitive", "Q146233"]: return "Gen" - elif case == "instrumental": + elif case in ["instrumental", "Q192997"]: return "Ins" - elif case == "prepositional": + elif case in ["prepositional", "Q2114906"]: return "Pre" - elif case == "locative": + elif case in ["locative", "Q202142"]: return "Loc" - elif case == "nominative": - return "Nom" else: return "" diff --git a/Data/Spanish/nouns/format_nouns.py b/Data/Spanish/nouns/format_nouns.py index 8aa4950f..c128f4ef 100644 --- a/Data/Spanish/nouns/format_nouns.py +++ b/Data/Spanish/nouns/format_nouns.py @@ -16,12 +16,12 @@ def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions. """ - if wikidata_gender == "masculine": + if wikidata_gender in ["masculine", "Q499327"]: return "M" - elif wikidata_gender == "feminine": + if wikidata_gender in ["feminine", "Q1775415"]: return "F" else: - return "" + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/Swedish/nouns/format_nouns.py b/Data/Swedish/nouns/format_nouns.py index f2d2967e..9148d531 100644 --- a/Data/Swedish/nouns/format_nouns.py +++ b/Data/Swedish/nouns/format_nouns.py @@ -16,10 +16,12 @@ def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions. """ - if wikidata_gender == "common gender": + if wikidata_gender in ["common gender", "Q1305037"]: return "C" - if wikidata_gender == "neuter": + if wikidata_gender in ["neuter", "Q1775461"]: return "N" + else: + return "" # nouns could have a gender that is not valid as an attribute def order_annotations(annotation): diff --git a/Data/update_data.py b/Data/update_data.py index 94b99936..86429a77 100644 --- a/Data/update_data.py +++ b/Data/update_data.py @@ -8,10 +8,42 @@ import json import os +import sys from tqdm.auto import tqdm from wikidataintegrator import wdi_core +with open("total_data.json") as f: + current_data = json.load(f) + +current_languages = list(current_data.keys()) +word_types = ["nouns", "verbs", "prepositions"] + +language = None +word_type = None +if len(sys.argv) == 2: + arg = sys.argv[1] + if arg in current_languages: + language = arg + elif arg in word_types: + word_type = arg + else: + InterruptedError( + """" + An invalid argument was specified. + For languages, please choose from those found as keys in total_data.json. + For grammatical types, please choose from nouns, verbs or prepositions. + """ + ) + +elif len(sys.argv) == 3: + language = sys.argv[1] + word_type = sys.argv[2] + + print(language) + print(word_type) + +# Derive Data directory elements for potential queries. data_dir_elements = [] for path, _, files in os.walk("."): @@ -28,11 +60,28 @@ } ) -word_types = ["nouns", "verbs", "prepositions"] - -with open("total_data.json") as f: - current_data = json.load(f) -current_languages = list(current_data.keys()) +# Subset current_languages and word_types if arguments have been passed. +if language is not None: + if language in current_languages: + current_languages = [l for l in current_languages if l == language] + else: + InterruptedError( + """" + An invalid language was specified. + Please choose from those found as keys in total_data.json. + """ + ) + +if word_type is not None: + if word_type in word_types: + word_types = [w for w in word_types if w == word_type] + else: + InterruptedError( + """" + An invalid grammatical type was specified. + Please choose from nouns, verbs or prepositions. + """ + ) possible_queries = [] for d in data_dir_dirs: @@ -58,9 +107,18 @@ query_lines = file.readlines() # First format the lines into a multi-line string and then pass this to wikidataintegrator. - query_results = wdi_core.WDFunctionsEngine.execute_sparql_query( + print(f"Querying {q.split('/')[0]} {q.split('/')[1]}") + query = wdi_core.WDFunctionsEngine.execute_sparql_query( """{}""".format("".join(query_lines)) ) -print(query_results) -print(query_path) + query_results = query["results"]["bindings"] + + results_formatted = [] + for r in query_results: # query_results is also a list + r_dict = {k: r[k]["value"] for k in r.keys()} + + results_formatted.append(r_dict) + +with open("./example.json", "w", encoding="utf-8",) as f: + json.dump(results_formatted, f, ensure_ascii=False, indent=2)