diff --git a/constants.py b/constants.py index 451dce9..e69de29 100644 --- a/constants.py +++ b/constants.py @@ -1,40 +0,0 @@ -import pymorphy2 -import re - -VERB_PREFIXES = [ - 'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo', - 'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz', - 'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za', -] - -CYR_LETTER_SUBS = { - "н": "њ", "л": "љ", "е": "є", "и": "ы" -} - -SIMPLE_DIACR_SUBS = { - 'e': 'ě', 'c': 'č', 'z': 'ž', 's': 'š', -} -# NOTE: pymorphy2 cannot work with several changes, i.e. {'e': 'ě', 'e': 'ę'} -ETM_DIACR_SUBS = { - 'a': 'å', 'u': 'ų', 'č': 'ć', 'e': 'ę', - 'n': 'ń', 'r': 'ŕ', 'l': 'ľ', - # 'dž': 'đ' # ne funguje -} - -DEFAULT_UNITS = [ - [ - pymorphy2.units.DictionaryAnalyzer() - ], - pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES), - [ - pymorphy2.units.UnknownPrefixAnalyzer(), - pymorphy2.units.KnownSuffixAnalyzer() - ] -] - -letters = "a-zа-яёěčžšåųćęđŕľńјљєњ" -BASE_ISV_TOKEN_REGEX = re.compile( - f'''(?:-|[^{letters}\s"'""«»„“-]+|[0-9{letters}_]+(-?[0-9{letters}_]+)*)''', - re.IGNORECASE | re.UNICODE -) - diff --git a/convert.py b/convert.py index 826c897..75e87e4 100644 --- a/convert.py +++ b/convert.py @@ -1,10 +1,6 @@ from __future__ import unicode_literals import re -import sys -import gzip import os.path -import bz2file as bz2 -import codecs import logging import ujson @@ -19,17 +15,20 @@ doubleform_signal = signal('doubleform-found') + def getArr(details_string): - return [x for x in details_string - .replace("./", '/') - .replace(" ", '') - .split('.') - if x != '' - ] - -diacr_letters = "žčšěйćżęųœ" -plain_letters = "жчшєjчжеуо" + return [ + x for x in details_string + .replace("./", '/') + .replace(" ", '') + .split('.') + if x != '' + ] + +# TODO: move this to normalizacija.py or constants.py +diacr_letters = "žčšěйćżęųœ" +plain_letters = "жчшєjчжеуо" lat_alphabet = "abcčdeěfghijjklmnoprsštuvyzž" cyr_alphabet = "абцчдеєфгхийьклмнопрсштувызж" @@ -44,17 +43,18 @@ def getArr(details_string): nms2std_trans = str.maketrans(nms_alphabet, std_alphabet) -extended_nms_alphabet = "áàâāíìîīĭıąǫũéēĕëèœóôŏöòĵĺļljýłçʒřťȯďś" -regular_etym_alphabet = "aaaaiiiiiiųųųeeėėėoooȯȯȯjľľľylczŕtods" +extended_nms_alphabet = "áàâāíìîīĭıąǫũéēĕëèœóôŏöòȯĵĺļljýłçʒřťďśńź" +regular_etym_alphabet = "aaaaiiiiiiųųųeeėėėoooȯȯȯȯjľľľylczŕťďśńź" ext_nms2std_nms_trans = str.maketrans(extended_nms_alphabet, regular_etym_alphabet) + def lat2cyr(thestring): # "e^" -> "ê" # 'z\u030C\u030C\u030C' -> 'ž\u030C\u030C' thestring = unicodedata.normalize( - 'NFKC', + 'NFKC', thestring ).lower().replace("\n", " ") @@ -76,18 +76,23 @@ def lat2cyr(thestring): def lat2etm(thestring): - return thestring.translate(ext_nms2std_nms_trans).strip() + # hack with dʒ + return thestring.translate(ext_nms2std_nms_trans).replace("đ", "dʒ").strip() def lat2std(thestring): return thestring.translate(nms2std_trans).replace("đ", "dž").strip() + translation_functions = { "isv_cyr": lat2cyr, "isv_lat": lat2std, "isv_etm": lat2etm, } +VERB_AUX_WORDS = {'(je)', 'sę', '(sųt)', 'ne'} + + def infer_pos(arr): if 'adj' in arr: return 'adjective' @@ -100,13 +105,13 @@ def infer_pos(arr): if 'prep' in arr: return 'preposition' if 'pron' in arr: - return 'pronoun'; + return 'pronoun' if 'num' in arr: - return 'numeral'; + return 'numeral' if 'intj' in arr: - return 'interjection'; + return 'interjection' if 'v' in arr: - return 'verb'; + return 'verb' def export_grammemes_description_to_xml(tag_set): @@ -140,7 +145,7 @@ def 
__init__(self, fname): self.lt2opencorpora = {} with open(fname, 'rb') as fp: - r = DictReader(fp,delimiter=';') + r = DictReader(fp, delimiter=';') for tag in r: # lemma form column represents set of tags that wordform should @@ -190,13 +195,15 @@ def _get_group_no(self, tag_name): return len(self.groups) def sort_tags(self, tags): + # TODO: this function is not used, but the output would be nicer if it were def inner_cmp(a, b): a_group = self._get_group_no(a) b_group = self._get_group_no(b) + # cmp was a built-in in Python 2 only; it no longer exists in Python 3 if a_group == b_group: - return cmp(a, b) - return cmp(a_group, b_group) + return cmp(a, b) # noqa: F821 + return cmp(a_group, b_group) # noqa: F821 return sorted(tags, cmp=inner_cmp) @@ -298,7 +305,6 @@ def export_to_xml(self, i, mapping, rev=1, lang="isv_cyr"): l_form = ET.SubElement(lemma, "l", t=output_lemma_form) self._add_tags_to_element(l_form, common_tags, mapping) - for forms in self.forms.values(): for form in forms: output_form = form.form.lower() @@ -316,6 +322,7 @@ def export_to_xml(self, i, mapping, rev=1, lang="isv_cyr"): return lemma + def yield_all_simple_adj_forms(forms_obj, pos): if "casesSingular" in forms_obj: forms_obj['singular'] = forms_obj['casesSingular'] @@ -366,6 +373,7 @@ def yield_all_simple_adj_forms(forms_obj, pos): yield content[0], {case, "plur", "neut", animatedness} | pos yield content[0], {case, "plur", "femn", animatedness} | pos + def yield_all_noun_forms(forms_obj, pos, columns): for case, data in forms_obj.items(): for (form, form_name) in zip(data, columns): @@ -376,7 +384,7 @@ def yield_all_noun_forms(forms_obj, pos, columns): form_name = "plur" if form_name == "masculine": form_name = 'masc' - # TODO: + # TODO: if form_name == "feminine/neuter": yield form, {case, 'femn'} | pos yield form, {case, 'neut'} | pos @@ -398,13 +406,10 @@ def yield_all_noun_forms(forms_obj, pos, columns): else: yield form, {case, form_name} | pos -VERB_AUX_WORDS = {'(je)', 'sę', '(sųt)', 'ne'} def yield_all_verb_forms(forms_obj, pos, base): is_byti = forms_obj['infinitive'] == 'bytì' - # if forms_obj['infinitive'].replace("ì", "i") != base: - # print(forms_obj['infinitive'], base) # ====== Infinitive ====== yield forms_obj['infinitive'], pos | {"INFN"} @@ -441,7 +446,6 @@ def yield_all_verb_forms(forms_obj, pos, base): subentry = entry.split(" ")[0] yield subentry, pos | {time} | one_tag - # ====== Future ====== # ['future'] # future uses infinitive and aux verbs @@ -488,12 +492,14 @@ def yield_all_verb_forms(forms_obj, pos, base): [{'actv', 'present'}, {'pssv', 'present'}, {'actv', 'past'}, {'pssv', 'past'}] ): # TODO: will break for multi-word verbs - parts = (forms_obj[time] + parts = ( + forms_obj[time] .replace("ne ", "") - .replace("ši sá", "ša sę").replace("ši sé", "še sę") # THIS MAKES PARSER NON STANDARD COMPLIANT + .replace("ši sá", "ša sę").replace("ši sé", "še sę") # THIS IS A CHANGE FROM ORIGINAL LOGIC .replace(" sę", "") .replace(",", "").replace("(", "") .replace(")", "") - .split(" ")) + .split(" ") + ) subentry_tags = [{"V-ju"}, {'V-m'}] if len(parts) == 1: @@ -521,11 +527,12 @@ def yield_all_verb_forms(forms_obj, pos, base): def iterate_json(forms_obj, pos_data, base): pos = infer_pos(pos_data) + pos_data = {x for x in pos_data if x != "m/f"} if isinstance(forms_obj, str) or pos is None: return base, pos_data if "adj" in pos: - yield from yield_all_simple_adj_forms(forms_obj, pos_data) + yield from yield_all_simple_adj_forms(forms_obj, pos_data) content = forms_obj['comparison'] yield content['positive'][0], {"positive"} | 
pos_data @@ -556,7 +563,6 @@ def iterate_json(forms_obj, pos_data, base): else: yield form, pos_data - elif forms_obj['type'] == 'noun': columns = forms_obj['columns'] yield from yield_all_noun_forms(forms_obj['cases'], pos_data, columns) @@ -579,9 +585,9 @@ def iterate_json(forms_obj, pos_data, base): yield from yield_all_noun_forms(forms_obj, pos_data, ['singular', 'plural']) return base, pos_data - + base_tag_set = {} -INDECLINABLE_POS = {'adverb', 'conjunction', 'preposition', 'interjection', 'particle', 'pronoun', 'numeral'} +INDECLINABLE_POS = {'adverb', 'conjunction', 'preposition', 'interjection', 'particle', 'pronoun', 'numeral'} class Dictionary(object): @@ -603,10 +609,10 @@ def __init__(self, fname, mapping): forms_obj_array = ujson.loads(forms) # HOTFIX TIME! - if word_id == "36454": - pos = "adj." if word_id == "36649": - pos = "f." + pass + if word_id == "6181": + pass add_tags = [{f"VF-{form_num+1}"} for form_num, _ in enumerate(forms_obj_array)] @@ -615,6 +621,7 @@ def __init__(self, fname, mapping): isv_lemmas = isv_lemma.split(",") if "m./f." in pos: + # example: "6181" "križ","","m./f.","1","cross", isv_lemmas = [isv_lemma, isv_lemma] add_tags = [{'masc'}, {'femn'}] for add_tag, forms_obj, isv_lemma_current in zip(add_tags, forms_obj_array, isv_lemmas): @@ -623,7 +630,7 @@ def __init__(self, fname, mapping): details_set = set(getArr(pos)) | add_tag # if infer_pos is None, then fallback to the first form local_pos = infer_pos(details_set) or pos - if local_pos == "noun": + if local_pos == "noun": details_set |= {'noun'} if not isinstance(forms_obj, dict): @@ -669,7 +676,9 @@ def __init__(self, fname, mapping): if len(all_forms) > 2: print(isv_lemma_current, all_forms) raise NameError - all_tags = [{f"V-flex-{form_num+1}"} for form_num, _ in enumerate(all_forms)] + all_tags = [ + {f"V-flex-{form_num+1}"} for form_num, _ in enumerate(all_forms) + ] if len(all_forms) == 1: all_tags = [set()] @@ -679,7 +688,9 @@ def __init__(self, fname, mapping): tags=tag_set | add_tag, )) if local_pos in {"noun", "numeral"}: - number_forms |= {one_tag for one_tag in tag_set if one_tag in ['singular', 'plural']} + number_forms |= { + one_tag for one_tag in tag_set if one_tag in ['singular', 'plural'] + } if len(number_forms) == 1: if number_forms != {"singular"} and number_forms != {"plural"}: print(number_forms, current_lemma.lemma_form.form) diff --git a/example1.py b/example1.py index 737712a..571c5b8 100644 --- a/example1.py +++ b/example1.py @@ -1,159 +1,20 @@ import pymorphy2 import argparse -from constants import VERB_PREFIXES, SIMPLE_DIACR_SUBS, ETM_DIACR_SUBS, DEFAULT_UNITS -import os - -CS_FLAVOR = { - "VERB": - { - "infn": (-2, 't'), - # "1per+sing": (-1, 'u'), - "2per+sing": {'aješ': 'áš', 'iš': 'íš'}, - "3per+sing": {'aje': 'á', 'i': 'í'}, - "1per+plur": {'ajemo': 'áme', 'imo': 'íme'}, - "2per+plur": {'ajete': 'áte', 'ite': 'íte'}, - "3per+plur": {'jųt': 'jí', 'ųt': 'ou', 'ęt': "í"}, - }, - "NOUN": - { - "gent+sing+masc": (-1, "e"), - "accs+sing+masc": (-1, "e"), - "gent+plur+masc": (-2, "ů"), - "datv+plur+masc": (-2, "ům"), - "accs+plur+masc": (-2, "e"), - "loct+plur+masc": (-2, "ech"), - "ablt+sing+femn": (-3, "ou"), - "datv+plur+femn": (-2, "ám"), - "ablt+sing+neut": (-2, "em"), - "datv+plur+neut": (-2, "ům"), - "loct+plur+neut": (-2, "ech") - }, - "ADVB": {"ADVB": (-1, 'ě')}, - "ADJF": - { - "nomn+plur+femn": (-1, "é"), - "accs+plur+femn": (-1, "é"), - "nomn+sing+neut": (-1, "é"), - "accs+sing+neut": (-1, "é"), - "nomn+plur+neut": (-1, "á"), - "accs+plur+neut": (-1, "á"), - 
- "accs+sing+femn": (-1, "ou"), - "ablt+sing+femn": (-1, "ou"), - "loct+sing+masc": (-2, "ém"), - "loct+sing+neut": (-2, "ém"), - "loct+sing+femn": (-2, "é"), - "datv+sing+femn": (-2, "é"), - "gent+sing+femn": (-2, "é"), - - "accs+plur+masc": (-2, "é"), - "loct+plur": (-2, "ých"), - "datv+plur": (-2, "ých"), - "ablt+plur": (-3, "ými"), - } -} - - -PL_FLAVOR = { - "VERB": - { - "infn": (-2, 'Ч'), - "1per+sing": (-1, 'ę'), - "3per+plur": (-2, 'ą'), - "3per+plur": (-2, 'ą'), - }, - "NOUN": - { - "loct+sing+masc": (-1, "ě"), - "accs+sing+femn": (-1, "ę"), - }, - "ADVB": {"ADVB": (-1, 'e')}, - "ADJF": - { - "nomn+plur+femn": (-1, "ie"), - "accs+plur+femn": (-1, "ie"), - "nomn+plur+neut": (-1, "ie"), - "accs+plur+neut": (-1, "ie"), - - "accs+sing+femn": (-1, "ą"), - "ablt+sing+femn": (-1, "ą"), - - "loct+sing+femn": (-2, "ej"), - "datv+sing+femn": (-2, "ej"), - "gent+sing+femn": (-2, "ej"), - - "accs+plur+masc": (-2, "ych"), - "loct+plur": (-2, "ych"), - "datv+plur": (-2, "ych"), - } -} - - -SR_FLAVOR = { - "VERB": - { - }, - "NOUN": - { - "nomn+plur+masc": (-1, "ovi"), - "gent+plur+masc": (None, "a"), - # TODO: https://fastlanguagemastery.com/learn-foreign-languages/serbian-language/serbian-cases-of-nouns/ - }, - "ADJF": - { - - "nomn+sing+masc": {"ny": "an"}, - "gent+sing+masc": (-1, ""), - "accs+sing+masc+anim": (-1, ""), - "datv+sing+masc": (-1, ""), - "loct+sing+masc": (-1, ""), - - "gent+sing+femn": (-2, "e"), - "ablt+sing+femn": (-2, "om"), - - "loct+plur": (-2, "im"), - "ablt+plur": (-3, "im"), - } -} +from isv_nlp_utils.constants import DEFAULT_UNITS +from isv_nlp_utils.flavorizacija import ( + CS_FLAVOR, PL_FLAVOR, RU_FLAVOR, SR_FLAVOR, + rus_letter_change, pol_letter_change, cz_letter_change, srb_letter_change +) +import os -RU_FLAVOR = { - "VERB": - { - "infn": (-1, 'ь'), - "3per+sing": (None, 't'), - }, - "NOUN": - { - "loct+sing+masc": (-1, "ě"), - }, - "ADJF": - { - "nomn+sing+femn": (None, "ja"), - "nomn+sing+neut": (None, "ě"), - "nomn+sing+masc": (None, "j"), - - "accs+sing+femn": (None, "ju"), - "accs+sing+neut": (None, "ě"), - "accs+sing+masc+anim": (None, ""), - "accs+sing+masc+inan": (None, "j"), - - # "accs+plur": (None, "iě"), - "nomn+plur": (-1, "ые"), - "accs+plur+neut": (-1, "ые"), - # "accs+plur+femn+anim": (-1, "ых"), - # "accs+plur+femn+inan": (-1, "ые"), - # "accs+plur+masc+anim": (-1, "ых"), - # "accs+plur+masc+inan": (-1, "ые"), - } -} - def flavorise(word, golden_pos_tag, isv_morph, flavor, ju): if golden_pos_tag == "PNCT": return word if golden_pos_tag == "ADVB": - variants = [v for v in isv_morph.parse(word) + variants = [ + v for v in isv_morph.parse(word) if v.tag.POS == "ADJF" and v.tag.number == "sing" and v.tag.gender == "neut" and v.tag.case == "nomn" ] @@ -181,7 +42,7 @@ def flavorise(word, golden_pos_tag, isv_morph, flavor, ju): for condition_plus, transform in flavor_rules.items(): conditions_arr = condition_plus.split("+") is_match = all( - all(cond in v.tag for cond in conditions_arr) + all(cond in v.tag for cond in conditions_arr) for v in variants ) if is_match: @@ -195,92 +56,23 @@ def flavorise(word, golden_pos_tag, isv_morph, flavor, ju): return word -# no j/й/ь support -lat_alphabet = "abcčdeěfghijklmnoprsštuvyzžęųćåńľŕ" -cyr_alphabet = "абцчдеєфгхијклмнопрсштувызжяучанлр" -lat2cyr_trans = str.maketrans(lat_alphabet, cyr_alphabet) -pol_alphabet = "abcčdeěfghijklmnoprsštuwyzżęąconlr" -lat2pol_trans = str.maketrans(lat_alphabet, pol_alphabet) - -def srb_letter_change(word): - word = word.replace('ć', "ћ").replace('dž', "ђ").replace("ę", "е") - 
word = word.translate(lat2cyr_trans) - - return word.replace('ы', "и").replace('нј', "њ").replace('лј', "љ") - -def pol_letter_change(word): - word = word.translate(lat2pol_trans) - return (word.replace('č', "cz").replace('š', "sz") - .replace('rj', "rz").replace('rě', "rze").replace('ri', "rzy") - .replace('ě', "ie") - .replace('Ч', "ć") - .replace('lj', "л").replace('l', "ł").replace("л", "l").replace('łę', "lę") - .replace('nj', "ni").replace('wj', "wi") - .replace('ci', "cy") - .replace('ji', "i") - .replace('dż', "dz") - ) - -def cz_letter_change(word): - return (word.replace('ę', "ě") - .replace('ų', "u") - .replace('šč', "št") - .replace('rje', "ří") - .replace('rj', "ř") - .replace('rě', "ře") - .replace('ri', "ři") - .replace('đ', "z") - .replace('å', "a") - .replace('h', "ch") - .replace('g', "h") - .replace('ć', "c") - .replace('kě', "ce") - .replace('gě', "ze") - .replace('lě', "le") - .replace('sě', "se") - .replace('hě', "še") - .replace('cě', "ce") - .replace('zě', "ze") - .replace('nju', "ni") - .replace('nj', "ň") - .replace('tje', "tí") - .replace('dje', "dí") - .replace('lju', "li") - .replace('ču', "či") - .replace('cu', "ci") - .replace('žu', "ži") - .replace('šu', "ši") - .replace('řu', "ři") - .replace('zu', "zi") - .replace('ijejų', "í") - .replace('ija', "e") - .replace('ijų', "i") - .replace('ij', "í") - ) - -def rus_letter_change(word): - word = word.replace("ń", "нь").replace("ľ", "ль") - word = word.translate(lat2cyr_trans) - return (word.replace('ју', "ю").replace('ја', "я").replace('јо', "ё") - .replace('ији', "ии") - .replace('рј', "рь").replace('лј', "ль").replace('нј', "нь") - .replace('ј', "й") - .replace('йя', "я").replace('йе', "е") - .replace('ья', "я").replace('ье', "е") - .replace('дж', "жд") - ) if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Kludge Flavorisation Example') + description='Kludge Flavorisation Example' + ) parser.add_argument('path') args = parser.parse_args() isv_morph = pymorphy2.MorphAnalyzer(os.path.join(args.path, "out_isv_etm"), units=DEFAULT_UNITS) - text = 'myslim že to bųde pomoćno za råzvitų flavorizacijų . Toj tekst v råzvitoj {LANG} flavorizaciji bųde izględati tako . Take prěměny mogųt pomagati v učeńju i råzuměńju medžuslovjańskogo języka i drugyh slovjańskyh językov . Takože to jest važny krok v tvorjeńju mehanizma avtomatičnogo prěklada .'.split(" ") + text = ('myslim že to bųde pomoćno za råzvitų flavorizacijų . ' + 'Toj tekst v råzvitoj {LANG} flavorizaciji bųde izględati tako . ' + 'Take prěměny mogųt pomagati v učeńju i råzuměńju medžuslovjańskogo języka i drugyh slovjańskyh językov . ' + 'Takože to jest važny krok v tvorjeńju mehanizma avtomatičnogo prěklada .' 
+ ).split(" ") - tags = ('VERB CONJ NPRO VERB ADVB PREP ADJF NOUN PNCT ' + tags = ('VERB CONJ NPRO VERB ADVB PREP ADJF NOUN PNCT ' 'NPRO NOUN PREP ADJF ADJF NOUN VERB VERB ADVB PNCT ' 'ADJF NOUN VERB VERB PREP NOUN CONJ NOUN ADJF NOUN CONJ ADJF ADJF NOUN PNCT ' 'ADVB NPRO VERB ADJF NOUN PREP NOUN NOUN ADJF NOUN PNCT ' @@ -291,7 +83,7 @@ def rus_letter_change(word): print() ALL_LANG_DATA = [ {'nomn': 'русскы', 'loct': 'russkoj', 'flavor': RU_FLAVOR, 'letter_change': rus_letter_change, 'ju': True}, - {'nomn': 'польскы', 'loct': 'poljskoj', 'flavor': PL_FLAVOR, 'letter_change': pol_letter_change, 'ju': True}, + {'nomn': 'пољскы', 'loct': 'poljskoj', 'flavor': PL_FLAVOR, 'letter_change': pol_letter_change, 'ju': True}, {'nomn': 'чешскы', 'loct': 'češskoj', 'flavor': CS_FLAVOR, 'letter_change': cz_letter_change, 'ju': False}, {'nomn': 'србскы', 'loct': 'srbskoj', 'flavor': SR_FLAVOR, 'letter_change': srb_letter_change, 'ju': False}, ] @@ -299,7 +91,8 @@ def rus_letter_change(word): print(f"РЕЗУЛТАТ ({lang_data['nomn']})") print(">", end=" ") for word, tag in zip(text, tags): - if word == "{LANG}": word = lang_data['loct'] + if word == "{LANG}": + word = lang_data['loct'] raw_flavorized = flavorise(word, tag, isv_morph, lang_data['flavor'], lang_data['ju']) func = lang_data['letter_change'] print(func(raw_flavorized), end=" ") diff --git a/example2.py b/example2.py index bb40dde..9007843 100644 --- a/example2.py +++ b/example2.py @@ -1,6 +1,5 @@ -import pymorphy2 import argparse -from constants import VERB_PREFIXES, SIMPLE_DIACR_SUBS, ETM_DIACR_SUBS, DEFAULT_UNITS, BASE_ISV_TOKEN_REGEX +from isv_nlp_utils.constants import BASE_ISV_TOKEN_REGEX, create_analyzers_for_every_alphabet import ipymarkup # pip install ipymarkup @@ -61,23 +60,14 @@ def print_spellcheck(text, std_morph): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Kludge Spellcheck Example') + parser = argparse.ArgumentParser(description='Kludge Spellcheck Example') parser.add_argument('path') args = parser.parse_args() path = args.path - std_morph = pymorphy2.MorphAnalyzer( - path+"out_isv_lat", - units=DEFAULT_UNITS, - char_substitutes=SIMPLE_DIACR_SUBS - ) - - etm_morph = pymorphy2.MorphAnalyzer( - path+"out_isv_etm", - units=DEFAULT_UNITS, - char_substitutes=ETM_DIACR_SUBS - ) + abecedas = create_analyzers_for_every_alphabet(path) + std_morph = abecedas['lat'] + etm_morph = abecedas['etm'] text = "ja funguju i razuměju avtododavanje etymologičnyh bukv" @@ -107,8 +97,12 @@ def print_spellcheck(text, std_morph): print(text_full) print() - text = "Biblioteka pymorphy2 jest napisana za jezyk Python v 2012 letu. Ona jest ne jedino lemmatizer, napravdu ona jest morfologičny analizator i generator (to znači že biblioteka uměje razuměti i budovati fleksiju slov). Ona ima poddržku russkogo jezyka i eksperimentalnu poddržku ukrajinskogo jezyka." - text = "Biblioteka pymorphy2 jest napisana za jezyk Python v 2012 letu. Ona imaje nekoliko osoblivostej, ktore delajut jej ukoristanje za MS mnogo uměstnym." - + text = ("Biblioteka pymorphy2 jest napisana za jezyk Python v 2012 letu. " + "Ona jest ne jedino lemmatizer, napravdu ona jest morfologičny analizator i generator " + "(to znači že biblioteka uměje razuměti i budovati fleksiju slov). Ona ima poddržku " + "russkogo jezyka i eksperimentalnu poddržku ukrajinskogo jezyka.") print_spellcheck(text, std_morph) + text = ("Biblioteka pymorphy2 jest napisana za jezyk Python v 2012 letu. 
" + "Ona imaje nekoliko osoblivostej, ktore delajut jej ukoristanje za MS mnogo uměstnym.") + print_spellcheck(text, std_morph) diff --git a/example3.py b/example3.py index d2c092e..7332f45 100644 --- a/example3.py +++ b/example3.py @@ -1,25 +1,29 @@ -import pymorphy2 import argparse from collections import Counter -from constants import VERB_PREFIXES, SIMPLE_DIACR_SUBS, ETM_DIACR_SUBS, DEFAULT_UNITS +from isv_nlp_utils.constants import create_analyzers_for_every_alphabet if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Kludge Statistics Example') + parser = argparse.ArgumentParser(description='Kludge Statistics Example') parser.add_argument('path') args = parser.parse_args() path = args.path - etm_morph = pymorphy2.MorphAnalyzer( - path+"out_isv_etm", - units=DEFAULT_UNITS, - char_substitutes=ETM_DIACR_SUBS - ) + etm_morph = create_analyzers_for_every_alphabet(path)['etm'] - text = "Naša misija jest govoriti najvyše råzumlivo, zato dělamo eksperimenty, čęsto pytajemo ljudi i diskutujemo o tom kako ulěpšati naše govorenje. Zato takože čęsto napominamo ljudi, kaki dělajųt pogrěšky, aby govorili drugo. To sųt vsegda sověty a tvoje govorenje to nakraj jest tvoj izbor. My prosto staramo sę byti možlivo najvyše råzumlivi" + text = ( + "Naša misija jest govoriti najvyše råzumlivo, " + "zato dělamo eksperimenty, čęsto pytajemo ljudi i diskutujemo o tom " + "kako ulěpšati naše govorenje. Zato takože čęsto napominamo ljudi, " + "kaki dělajųt pogrěšky, aby govorili drugo. To sųt vsegda sověty " + "i tvoje govorenje to nakraj jest tvoj izbor. My prosto staramo sę " + "byti možlivo najvyše råzumlivi" + ) - text = "on je pisal, ona je pisala, oni sut pisali. Ja jesm pisavša. Piši i ty, jerbo pisano slovo jest dobro. Generalno pisanje jest dobro" + text = ( + "on je pisal, ona je pisala, oni sut pisali. Ja jesm pisavša. Piši i ty, " + "jerbo pisano slovo jest dobro. 
Generalno pisanje jest dobro" + ) print(etm_morph.parse("pisanje")) cnt = Counter() @@ -38,4 +42,3 @@ for form in forms: cnt[form] += 1/len(forms) print(cnt) - diff --git a/example4.py b/example4.py index 6aef507..e2932b9 100644 --- a/example4.py +++ b/example4.py @@ -4,10 +4,10 @@ import argparse from collections import Counter -import pymorphy2 import fitz # pip install pymupdf -from constants import VERB_PREFIXES, SIMPLE_DIACR_SUBS, ETM_DIACR_SUBS, DEFAULT_UNITS, BASE_ISV_TOKEN_REGEX +from isv_nlp_utils.constants import create_analyzers_for_every_alphabet, iterate_over_text + def download_file(url): local_filename = url.split('/')[-1] @@ -17,16 +17,9 @@ def download_file(url): return local_filename -def iterate_over_text(paragraph): - delimiters = BASE_ISV_TOKEN_REGEX.finditer(paragraph) - for delim in delimiters: - if any(c.isalpha() for c in delim.group()): - token = delim.group() - yield token if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Kludge Search Example') + parser = argparse.ArgumentParser(description='Kludge Search Example') parser.add_argument('path') args = parser.parse_args() path = args.path @@ -42,19 +35,12 @@ def iterate_over_text(paragraph): for page in doc: text += [page.getText()] - - std_morph = pymorphy2.MorphAnalyzer( - # path+"out_isv_etm", - path+"out_isv_lat", - units=DEFAULT_UNITS, - # char_substitutes=ETM_DIACR_SUBS - char_substitutes=SIMPLE_DIACR_SUBS - ) + std_morph = create_analyzers_for_every_alphabet(path)['lat'] cnt = Counter() for page in text[1:]: for token in iterate_over_text(page): - if not std_morph.word_is_known(token): + if not std_morph.word_is_known(token): razbor = std_morph.parse(token) lemma_form = razbor[0].normal_form if razbor else token cnt[lemma_form] += 1 @@ -83,13 +69,10 @@ def iterate_over_text(paragraph): print(std_morph.parse("petsto")) print(std_morph.parse("sam")) print(std_morph.parse("zapad")) - print(std_morph.word_is_known('zapad')) + print(std_morph.word_is_known('zapad')) print(form_data['zapad']) print(std_morph.parse("puti")) for lemma, data in form_data.items(): - #print() - #print(lemma) - #print(data) df.loc[lemma, :] = data print(df.head()) print(df.index) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8a110a5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pymorphy2 +ujson +unicodecsv +blinker +opencorpora-tools==0.6 diff --git a/run_generate.py b/run_generate.py index 9070e6e..a6665b3 100644 --- a/run_generate.py +++ b/run_generate.py @@ -7,6 +7,9 @@ from convert import Dictionary, doubleform_signal from pathlib import Path +import pymorphy2 + +from isv_nlp_utils.constants import DEFAULT_UNITS, ETM_DIACR_SUBS REPEATED_FORMS = Counter() @@ -24,7 +27,7 @@ def log_doubleform(sender, tags_signature): if RUN_EXPORT: subprocess.check_output( ["npm", "run", "generateParadigms"], - cwd=join(DIR,"interslavic"), shell=True + cwd=join(DIR, "interslavic"), shell=True ) @@ -61,27 +64,19 @@ def log_doubleform(sender, tags_signature): print('suffixes.json') print(Path(join(out_dir, 'suffixes.json')).stat().st_size) - print('suff.txt') - print(Path(join(DICTS_DIR, 'suff.txt')).stat().st_size) + # print('suff.txt') + # print(Path(join(DICTS_DIR, 'suff.txt')).stat().st_size) - print('paradigm.txt') - print(Path(join(DICTS_DIR, 'paradigm.txt')).stat().st_size) - - -import pymorphy2 -from pymorphy2 import units + # print('paradigm.txt') + # print(Path(join(DICTS_DIR, 'paradigm.txt')).stat().st_size) out_dir_etm = join(DIR, "pymorphy2-dicts", "out_isv_etm") 
etm_morph = pymorphy2.MorphAnalyzer( out_dir_etm, - units=[pymorphy2.units.DictionaryAnalyzer(), pymorphy2.units.KnownSuffixAnalyzer()], - char_substitutes={ - 'e': 'ě', 'c': 'č', 'z': 'ž', 's': 'š', - 'a': 'å', 'u': 'ų', 'č': 'ć', 'e': 'ę', - # 'dž': 'đ' # ne funguje - } + units=DEFAULT_UNITS, + char_substitutes=ETM_DIACR_SUBS ) print(etm_morph.parse("ljudij")) @@ -103,8 +98,6 @@ def log_doubleform(sender, tags_signature): print(morph.parse("фунгујут")) print() - - phrase = "Тутчас можем писати на прдачном језыковєдском нарєчју" phrase = "нарєчје јест разумливо приблизно всим машинам без ученја" @@ -114,7 +107,8 @@ def log_doubleform(sender, tags_signature): phrase = "писанйе jедним столбецем дозволjаjе додати информациjу односно двусмыслности" -phrase = "чи можем ли jа говорити на прдачном језыковєдском нарєчју в тутом каналу буде ли то добро Jесм поправил нєкаке грєшкы од првого раза" +phrase = "чи можем ли jа говорити на прдачном језыковєдском нарєчју в тутом каналу буде ли то добро" +phrase = "Jесм поправил нєкаке грєшкы од првого раза" phrase = "понєктори користники сут измыслили нєколико прдачных нарєчиј" phrase = "мене приjати же тутчас jест канал в ктором jа можем писати на прдачном језыковєдском нарєчју"
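
For orientation, a minimal end-to-end sketch of the refactored isv_nlp_utils entry points, stitched together from the examples above. This is an illustration, not code from the diff: it assumes run_generate.py has already produced the pymorphy2 dictionaries (out_isv_lat, out_isv_etm) under `path`, and that isv_nlp_utils exposes the names imported by example2.py and example4.py; the 'lat'/'etm' keys follow example2.py.

# Hypothetical usage sketch (assumptions noted above; `path` is a placeholder).
from isv_nlp_utils.constants import create_analyzers_for_every_alphabet, iterate_over_text

path = "pymorphy2-dicts/"  # hypothetical location of the generated dictionaries

abecedas = create_analyzers_for_every_alphabet(path)
std_morph = abecedas['lat']  # standard Latin orthography (as in example2.py)
etm_morph = abecedas['etm']  # etymological orthography (as in example3.py)

# Tokenize a sentence and lemmatize each token, as example4.py does for PDF pages.
for token in iterate_over_text("ja funguju i razuměju avtododavanje etymologičnyh bukv"):
    razbor = std_morph.parse(token)  # list of pymorphy2 Parse objects
    lemma = razbor[0].normal_form if razbor else token
    print(token, "->", lemma, std_morph.word_is_known(token))

# The etymological analyzer exposes the same pymorphy2 API:
print(etm_morph.parse("pisanje"))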