Skip to content

Commit

Permalink
funguje vysze dobro ale ne popolno
Browse files Browse the repository at this point in the history
  • Loading branch information
bt2901 committed Apr 24, 2021
1 parent 9c1062f commit f66d873
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 71 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@
**Чи работаjе ли?** Вероjетно, да. Можно, формат експортованого фаjла не jест наjлепши, але главно же оно работаjе.

**Проблем**
Тутчас имаjемо проблем с именниками, кторе могут имати како мужскы, тако и женскы род.
~~Тутчас имаjемо проблем с именниками, кторе могут имати како мужскы, тако и женскы род.~~

**Проблем**
Глаголы, кторе имаjут алтернативне формы (напр., `vladati, vladěti`) имаjут зло склоненьjе.
~~Глаголы, кторе имаjут алтернативне формы (напр., `vladati, vladěti`) имаjут зло склоненьjе.~~

**Проблем**
Вероjетно, потрєбно нєчто измыслити односно совршаного/несовршаного аспекта.

**Проблем**
Вероjетно, потрєбно имати додатну фазу генерациjи склоњеньја глаголных имен.

## _Конверсија до формата OpenCorpora XML_
Модификована версиjа `LT2OpenCorpora` се користаjе за тут стадију.

Expand All @@ -41,12 +44,15 @@
* Потрєбно измыслити добро изjасненjе грамем за меджусловjанскы язык (они будут имати разлику с русскими грамемами, jербо егзистуjут алтернативне северне/jужне формы, кратке/полны формы заименников и часованjе глаголов фунгуjе инако)
* Не jесм уверены, чи правилно ли имати присловник и придавник како различне формы jедного слова. Вероjетно, треба было бы вообче изчркнути формы, подобне `најбоље абхазски` и можно такоже изчркнути формы, подобне `најкомпјутернєјши`
* Потрєбно измыслити добру обработку вечеj, кторе имаjут нєколико словесов (Lemma = "zadržati dyh", але все формы не имаjут "дых") и возвратных глаголов
* Подобны ствар: osnovany na/podpirany od/polny naděje/prěznačeny za
* Потрєбно измыслити нєкаку методу работы с заименниками затоже прємного опциj анализа имаjут оне.
* Потрєбно додати до словников слова, кторе не имаjут склонениj ("односно", "чи", "ли", "и")

## _Генерациjа словников pymorphy2_

**Зачто?**

**Како?** Склонировать репозиторий `pymorphy2-dicts`
**Како?** Склонировати репозиторий `pymorphy2-dicts`

**Чи работаjе ли?** Вєројетно да, але потрєбно изтворити конфиг фаjл, подобны такому:
* `https://github.com/kmike/pymorphy2/blob/master/pymorphy2/lang/ru/config.py`
Expand Down
149 changes: 87 additions & 62 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,10 +479,17 @@ def iterate_json(forms_obj, pos_data, base):
yield from yield_all_simple_adj_forms(forms_obj, pos_data)
content = forms_obj['comparison']
yield content['positive'][0], {"positive"} | pos_data
yield content['comparative'][0], {"comparative"} | pos_data

comp_form = content['comparative'][0]
if " " not in comp_form:
yield comp_form, {"comparative"} | pos_data

# TODO: is it right to treat it as adjective??
yield content['positive'][1], {"adverb", "positive"} | pos_data
yield content['comparative'][1], {"adverb", "comparative"} | pos_data
comp_form = content['comparative'][1]
if " " not in comp_form:
yield comp_form, {"adverb", "comparative"} | pos_data

elif "numeral" in pos or 'pronoun' in pos:
if forms_obj['type'] == 'adjective':
yield from yield_all_simple_adj_forms(forms_obj, pos_data)
Expand Down Expand Up @@ -528,67 +535,69 @@ def __init__(self, fname, mapping):
for i, line in enumerate(fp):
raw_data, forms, pos_formatted = line.split("\t")
word_id, isv_lemma, addition, pos, *rest = ujson.loads(raw_data)
forms_obj = ujson.loads(forms)
forms_obj_array = ujson.loads(forms)

if not isinstance(forms_obj, dict):
if forms_obj != '':
continue
# print([isv_lemma, pos_formatted, forms_obj])
if " " in isv_lemma and "," not in isv_lemma and isinstance(forms_obj, dict):
splitted = isv_lemma.split()
if len(splitted) == 2 and "sę" in splitted:
counter_se += 1
else:
counter_multiword += 1
if "verb" not in pos_formatted:
# TODO TODO XXX
# print(isv_lemma.split(), pos_formatted)
# print(forms_obj)
counter_multiword_verb += 1


# Here we've found a new lemma, let's add old one to the list
# and continue

details_set = set(getArr(pos))
# if infer_pos is None, then fallback to the first form
pos = infer_pos(details_set) or pos
if pos == "noun":
details_set |= {'noun'}
current_lemma = Lemma(
isv_lemma,
lemma_form_tags=details_set,
)
number_forms = set()
for current_form, tag_set in iterate_json(forms_obj, details_set, isv_lemma):
if "/" in current_form:
all_forms = current_form.split("/")
else:
all_forms = [current_form]
if len(all_forms) > 2:
print(isv_lemma, all_forms)
raise NameError
for single_form, add_tag in zip(all_forms, [set(), {"alt-form"}]):
current_lemma.add_form(WordForm(
single_form,
tags=tag_set | add_tag,
))
if pos in {"noun", "numeral"}:
number_forms |= {one_tag for one_tag in tag_set if one_tag in ['singular', 'plural']}
if len(number_forms) == 1:
numeric = {"Sgtm"} if number_forms == {"singular"} else {"Pltm"}
current_lemma.lemma_form.tags |= numeric
if pos == "verb":
if forms_obj['infinitive'].replace("ì", "i") != isv_lemma:
current_lemma.lemma_form.form = forms_obj['infinitive']
# if "adj" in pos:
#if isv_lemma == "žučji":
# print(pos, isv_lemma, pos_formatted)
# print(raw_data)
# print(isv_lemma)
# print (current_lemma.lemma_form.tags)
# raise NameError
self.add_lemma(current_lemma)
for form_num, forms_obj in enumerate(forms_obj_array):
add_tag = set() if form_num == 0 else {f"alt{form_num}"}
if not isinstance(forms_obj, dict):
if forms_obj != '':
continue
# print([isv_lemma, pos_formatted, forms_obj])
if " " in isv_lemma and "," not in isv_lemma and isinstance(forms_obj, dict):
splitted = isv_lemma.split()
if len(splitted) == 2 and "sę" in splitted:
counter_se += 1
else:
counter_multiword += 1
if "verb" not in pos_formatted:
# TODO TODO XXX
# print(isv_lemma.split(), pos_formatted)
# print(forms_obj)
counter_multiword_verb += 1


# Here we've found a new lemma, let's add old one to the list
# and continue

details_set = set(getArr(pos)) | add_tag
# if infer_pos is None, then fallback to the first form
pos = infer_pos(details_set) or pos
if pos == "noun":
details_set |= {'noun'}
current_lemma = Lemma(
isv_lemma,
lemma_form_tags=details_set,
)
number_forms = set()
for current_form, tag_set in iterate_json(forms_obj, details_set, isv_lemma):
if "/" in current_form:
all_forms = current_form.split("/")
else:
all_forms = [current_form]
if len(all_forms) > 2:
print(isv_lemma, all_forms)
raise NameError
for single_form, add_tag in zip(all_forms, [set(), {"alt-form"}]):
current_lemma.add_form(WordForm(
single_form,
tags=tag_set | add_tag,
))
if pos in {"noun", "numeral"}:
number_forms |= {one_tag for one_tag in tag_set if one_tag in ['singular', 'plural']}
if len(number_forms) == 1:
numeric = {"Sgtm"} if number_forms == {"singular"} else {"Pltm"}
current_lemma.lemma_form.tags |= numeric
if pos == "verb":
if forms_obj['infinitive'].replace("ì", "i") != isv_lemma:
current_lemma.lemma_form.form = forms_obj['infinitive']
# if "adj" in pos:
#if isv_lemma == "žučji":
# print(pos, isv_lemma, pos_formatted)
# print(raw_data)
# print(isv_lemma)
# print (current_lemma.lemma_form.tags)
# raise NameError
self.add_lemma(current_lemma)
print(counter_multiword)
print(counter_multiword_verb)
print(counter_se)
Expand All @@ -603,10 +612,26 @@ def export_to_xml(self, fname):
tree = ET.ElementTree(root)
root.append(export_grammemes_description_to_xml(tag_set_full))
lemmata = ET.SubElement(root, "lemmata")
known_pronouns = {}

for i, lemma in enumerate(self.lemmas.values()):
lemma_xml = lemma.export_to_xml(i + 1, tag_set_full)
if lemma_xml is not None:
# if "NPRO" in lemma.lemma_form.tags:
# if "NPRO" in str(lemma_xml):
if "pron" in lemma.lemma_form.tags:
print(lemma.lemma_form.tags, lemma.lemma_form.form)
signature = "|".join(
f"{k}: {v[0].form}" for i, (k, v) in enumerate(lemma.forms.items())
if i != 0
)
if signature in known_pronouns:
print(known_pronouns[signature], "<-", lemma.lemma_form.form)
continue
else:
known_pronouns[signature] = lemma.lemma_form.form
#print(lemma_xml)
#print(lemma_xml.write())
lemmata.append(lemma_xml)

tree.write(fname, encoding="utf-8")
2 changes: 2 additions & 0 deletions mapping_isv.csv
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ slang;сленг;aux;;;Slng;TODO: фунгуjе
unknown;ще неопрацьовані слова/форми;aux;;;unknown;Має зникнути взагалі
pers;особовий (займенник);aux;;;pers;
alt;альтернативний правопис;aux;;;alt;TODO
alt1;альтернативны форм 1;aux;;;alt1;
alt2;альтернативны форм 2;aux;;;alt2;
noun;именник;post;v_naz;anim,inanim;NOUN;
pron;заименник;post;v_naz;;NPRO;
verb;глагол;post;inf;perf,imperf;VERB;
Expand Down
34 changes: 28 additions & 6 deletions run_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@ def log_doubleform(sender, tags_signature):
REPEATED_FORMS.update({tags_signature: 1})


DIR = "C:\\dev\\"
DIR = "C:\\dev"
DEBUG = True
RUN_EXPORT = False
RUN_CONVERT = True
RUN_BUILD_DICTS = True
RUN_CONVERT = False
RUN_BUILD_DICTS = False

if RUN_EXPORT:
subprocess.check_output(
["npm", "run", "generateParadigms"],
cwd=join(DIR,"interslavic")
cwd=join(DIR,"interslavic"), shell=True
)


Expand Down Expand Up @@ -56,9 +56,22 @@ def log_doubleform(sender, tags_signature):

import pymorphy2
morph = pymorphy2.MorphAnalyzer(out_dir)
print(morph.parse("фунгујут"))
# print(morph.parse("фунгујут"))
print()

from pathlib import Path

DICTS_DIR = join(DIR, "pymorphy2-dicts")

print('suffixes.json')
print(Path(join(out_dir, 'suffixes.json')).stat().st_size)

print('suff.txt')
print(Path(join(DICTS_DIR, 'suff.txt')).stat().st_size)

print('paradigm.txt')
print(Path(join(DICTS_DIR, 'paradigm.txt')).stat().st_size)

phrase = "Тутчас можем писати на прдачном језыковєдском нарєчју"

phrase = "нарєчје јест разумливо приблизно всим машинам без ученја"
Expand All @@ -69,11 +82,20 @@ def log_doubleform(sender, tags_signature):


phrase = "чи можем ли jа говорити на прдачном језыковєдском нарєчју в тутом каналу буде ли то добро Jесм поправил нєкаке грєшкы од првого раза"
phrase = "понєктори користники сут измыслили нєколико прдачных нарєчиј"

phrase = "мене приjати же тутчас jест канал в ктором jа можем писати на прдачном језыковєдском нарєчју"

phrase = "хм jа трєбују измыслити нєкаку методу работы с заименниками прємного опциj анализа имаjут оне"

phrase = "Мой изкус односно фунгованйа всакоможных заименников чи имайут ли премного формов они и оне"

for word in phrase.replace("й", "j").replace("j", "ј").split(" "):
for i, word in enumerate(phrase.replace("й", "j").replace("j", "ј").split(" ")):

parsings = morph.parse(word)
desc = " | ".join(f"**{parsing.normal_form}** - {parsing.tag}" for parsing in parsings)
if i % 2 == 0:
desc = "> " + desc
print(desc)
# print(len(morph.parse(word)))
# print(morph.parse(word)[0])

0 comments on commit f66d873

Please sign in to comment.