
vieliky klinap (big cleanup)
bt2901 committed Oct 14, 2021
1 parent 5c018ae commit e3ab01c
Showing 8 changed files with 124 additions and 381 deletions.
40 changes: 0 additions & 40 deletions constants.py
@@ -1,40 +0,0 @@
import pymorphy2
import re

VERB_PREFIXES = [
'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo',
'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz',
'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za',
]

CYR_LETTER_SUBS = {
"н": "њ", "л": "љ", "е": "є", "и": "ы"
}

SIMPLE_DIACR_SUBS = {
'e': 'ě', 'c': 'č', 'z': 'ž', 's': 'š',
}
# NOTE: pymorphy2 takes this as a dict, so one letter cannot get several
# substitutions at once: {'e': 'ě', 'e': 'ę'} would collapse to a single key
ETM_DIACR_SUBS = {
'a': 'å', 'u': 'ų', 'č': 'ć', 'e': 'ę',
'n': 'ń', 'r': 'ŕ', 'l': 'ľ',
# 'dž': 'đ'  # does not work
}

DEFAULT_UNITS = [
[
pymorphy2.units.DictionaryAnalyzer()
],
pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
[
pymorphy2.units.UnknownPrefixAnalyzer(),
pymorphy2.units.KnownSuffixAnalyzer()
]
]

letters = "a-zа-яёěčžšåųćęđŕľńјљєњ"
BASE_ISV_TOKEN_REGEX = re.compile(
f'''(?:-|[^{letters}\s"'""«»„“-]+|[0-9{letters}_]+(-?[0-9{letters}_]+)*)''',
re.IGNORECASE | re.UNICODE
)
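
For context: these (now deleted) constants plug straight into pymorphy2. The nested DEFAULT_UNITS list follows pymorphy2's grouped-units convention, and the substitution dicts have the shape of its char_substitutes option, which is presumably where they were used. A minimal sketch of that wiring, with a made-up dictionary path:

import pymorphy2
from constants import DEFAULT_UNITS, SIMPLE_DIACR_SUBS, BASE_ISV_TOKEN_REGEX

morph = pymorphy2.MorphAnalyzer(
    path="out_isv",                      # hypothetical compiled-dictionary directory
    units=DEFAULT_UNITS,                 # dictionary first, then prefix/suffix fallbacks
    char_substitutes=SIMPLE_DIACR_SUBS,  # lets undiacritized "zena" match "žena"
)
# finditer + group(0) is used because the regex has an inner capture group,
# so findall would return only that group, not the whole token
for m in BASE_ISV_TOKEN_REGEX.finditer("Dobro jutro, prijatelji!"):
    print(m.group(0), morph.parse(m.group(0))[:1])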

97 changes: 54 additions & 43 deletions convert.py
@@ -1,10 +1,6 @@
from __future__ import unicode_literals
import re
import sys
import gzip
import os.path
import bz2file as bz2
import codecs
import logging
import ujson

@@ -19,17 +15,20 @@

doubleform_signal = signal('doubleform-found')


def getArr(details_string):
return [x for x in details_string
.replace("./", '/')
.replace(" ", '')
.split('.')
if x != ''
]

diacr_letters = "žčšěйćżęųœ"
plain_letters = "жчшєjчжеуо"
return [
x for x in details_string
.replace("./", '/')
.replace(" ", '')
.split('.')
if x != ''
]
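
A quick sanity check of what getArr yields for part-of-speech strings as they appear in the dictionary (e.g. "m./f.", which is split into masc/femn entries further down):

assert getArr("adj.") == ["adj"]
assert getArr("v. tr. ipf.") == ["v", "tr", "ipf"]
assert getArr("m./f.") == ["m/f"]  # "./" is collapsed to "/" before splitting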


# TODO: move this to normalizacija.py or constants.py
diacr_letters = "žčšěйćżęųœ"
plain_letters = "жчшєjчжеуо"

lat_alphabet = "abcčdeěfghijjklmnoprsštuvyzž"
cyr_alphabet = "абцчдеєфгхийьклмнопрсштувызж"
@@ -44,17 +43,18 @@ def getArr(details_string):

nms2std_trans = str.maketrans(nms_alphabet, std_alphabet)

extended_nms_alphabet = "áàâāíìîīĭıąǫũéēĕëèœóôŏöòĵĺļljýłçʒřťȯďś"
regular_etym_alphabet = "aaaaiiiiiiųųųeeėėėoooȯȯȯjľľľylczŕtods"
extended_nms_alphabet = "áàâāíìîīĭıąǫũéēĕëèœóôŏöòȯĵĺļljýłçʒřťďśńź"
regular_etym_alphabet = "aaaaiiiiiiųųųeeėėėoooȯȯȯȯjľľľylczŕťďśńź"

ext_nms2std_nms_trans = str.maketrans(extended_nms_alphabet, regular_etym_alphabet)


def lat2cyr(thestring):

# "e^" -> "ê"
# 'z\u030C\u030C\u030C' -> 'ž\u030C\u030C'
thestring = unicodedata.normalize(
'NFKC',
'NFKC',
thestring
).lower().replace("\n", " ")

@@ -76,18 +76,23 @@ def lat2cyr(thestring):


def lat2etm(thestring):
return thestring.translate(ext_nms2std_nms_trans).strip()
# hack: the maketrans table maps single chars only, so đ is expanded to dʒ afterwards
return thestring.translate(ext_nms2std_nms_trans).replace("đ", "dʒ").strip()


def lat2std(thestring):
return thestring.translate(nms2std_trans).replace("đ", "dž").strip()


translation_functions = {
"isv_cyr": lat2cyr,
"isv_lat": lat2std,
"isv_etm": lat2etm,
}
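
The NFKC normalization at the top of lat2cyr composes stray combining diacritics into single code points before any translation table runs; a standalone illustration of the behaviour the comments describe:

import unicodedata

# "z" + COMBINING CARON (U+030C) becomes the single code point "ž" (U+017E):
assert unicodedata.normalize("NFKC", "z\u030C") == "ž"
# only one caron can compose; the extra ones stay as combining characters:
assert unicodedata.normalize("NFKC", "z\u030C\u030C\u030C") == "ž\u030C\u030C"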

VERB_AUX_WORDS = {'(je)', 'sę', '(sųt)', 'ne'}


def infer_pos(arr):
if 'adj' in arr:
return 'adjective'
@@ -100,13 +105,13 @@ def infer_pos(arr):
if 'prep' in arr:
return 'preposition'
if 'pron' in arr:
return 'pronoun';
return 'pronoun'
if 'num' in arr:
return 'numeral';
return 'numeral'
if 'intj' in arr:
return 'interjection';
return 'interjection'
if 'v' in arr:
return 'verb';
return 'verb'
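
With tag sets as produced by getArr, the visible branches map the dictionary's terse tags to full part-of-speech names, and anything unmatched falls through to an implicit None:

assert infer_pos({"adj"}) == "adjective"
assert infer_pos({"intj"}) == "interjection"
# assuming none of the elided branches between 'adj' and 'prep' matches 'tr'/'ipf':
assert infer_pos({"v", "tr", "ipf"}) == "verb"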


def export_grammemes_description_to_xml(tag_set):
@@ -140,7 +145,7 @@ def __init__(self, fname):
self.lt2opencorpora = {}

with open(fname, 'rb') as fp:
r = DictReader(fp,delimiter=';')
r = DictReader(fp, delimiter=';')

for tag in r:
# lemma form column represents set of tags that wordform should
@@ -190,13 +195,15 @@ def _get_group_no(self, tag_name):
return len(self.groups)

def sort_tags(self, tags):
# TODO: this function is not used, but the output would be nicer if it were
def inner_cmp(a, b):
a_group = self._get_group_no(a)
b_group = self._get_group_no(b)

# cmp was a built-in function in Python 2; it no longer exists in Python 3 (hence the noqa)
if a_group == b_group:
return cmp(a, b)
return cmp(a_group, b_group)
return cmp(a, b) # noqa: F821
return cmp(a_group, b_group) # noqa: F821

return sorted(tags, cmp=inner_cmp)

@@ -298,7 +305,6 @@ def export_to_xml(self, i, mapping, rev=1, lang="isv_cyr"):
l_form = ET.SubElement(lemma, "l", t=output_lemma_form)
self._add_tags_to_element(l_form, common_tags, mapping)


for forms in self.forms.values():
for form in forms:
output_form = form.form.lower()
@@ -316,6 +322,7 @@

return lemma
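
The export builds one <lemma> element per entry: an <l> child carries the lemma form (visible above), and the forms loop adds one element per inflected form. A rough sketch of the resulting shape; everything beyond the "lemma"/"l" tags and the "t" attribute seen in the code is an assumption based on the OpenCorpora format that pymorphy2 consumes:

import xml.etree.ElementTree as ET

lemma = ET.Element("lemma", rev="1")
l_form = ET.SubElement(lemma, "l", t="žena")   # lemma form; word is hypothetical
ET.SubElement(l_form, "g", v="noun")           # grammeme tags as <g v="...">, assumed
f_form = ET.SubElement(lemma, "f", t="ženy")   # one <f> per inflected form, assumed
ET.SubElement(f_form, "g", v="plural")
print(ET.tostring(lemma, encoding="unicode"))
# <lemma rev="1"><l t="žena"><g v="noun" /></l><f t="ženy"><g v="plural" /></f></lemma>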


def yield_all_simple_adj_forms(forms_obj, pos):
if "casesSingular" in forms_obj:
forms_obj['singular'] = forms_obj['casesSingular']
@@ -366,6 +373,7 @@ def yield_all_simple_adj_forms(forms_obj, pos):
yield content[0], {case, "plur", "neut", animatedness} | pos
yield content[0], {case, "plur", "femn", animatedness} | pos


def yield_all_noun_forms(forms_obj, pos, columns):
for case, data in forms_obj.items():
for (form, form_name) in zip(data, columns):
@@ -376,7 +384,7 @@ def yield_all_noun_forms(forms_obj, pos, columns):
form_name = "plur"
if form_name == "masculine":
form_name = 'masc'
# TODO:
# TODO:
if form_name == "feminine/neuter":
yield form, {case, 'femn'} | pos
yield form, {case, 'neut'} | pos
@@ -398,13 +406,10 @@ def yield_all_noun_forms(forms_obj, pos, columns):
else:
yield form, {case, form_name} | pos

VERB_AUX_WORDS = {'(je)', 'sę', '(sųt)', 'ne'}

def yield_all_verb_forms(forms_obj, pos, base):

is_byti = forms_obj['infinitive'] == 'bytì'
# if forms_obj['infinitive'].replace("ì", "i") != base:
# print(forms_obj['infinitive'], base)

# ====== Infinitive ======
yield forms_obj['infinitive'], pos | {"INFN"}
@@ -441,7 +446,6 @@ def yield_all_verb_forms(forms_obj, pos, base):
subentry = entry.split(" ")[0]
yield subentry, pos | {time} | one_tag


# ====== Future ======
# ['future']
# future uses infinitive and aux verbs
@@ -488,12 +492,14 @@ def yield_all_verb_forms(forms_obj, pos, base):
[{'actv', 'present'}, {'pssv', 'present'}, {'actv', 'past'}, {'pssv', 'past'}]
):
# TODO: will break if the verb is multi-word
parts = (forms_obj[time]
parts = (
forms_obj[time]
.replace("ne ", "")
.replace("ši sá", "ša sę").replace("ši sé", "še sę") # THIS MAKES PARSER NON STANDARD COMPLIANT
.replace("ši sá", "ša sę").replace("ši sé", "še sę") # THIS IS A CHANGE FROM ORIGINAL LOGIC
.replace(" sę", "")
.replace(",", "").replace("(", "") .replace(")", "")
.split(" "))
.split(" ")
)

subentry_tags = [{"V-ju"}, {'V-m'}]
if len(parts) == 1:
@@ -521,11 +527,12 @@ def yield_all_verb_forms(forms_obj, pos, base):

def iterate_json(forms_obj, pos_data, base):
pos = infer_pos(pos_data)
pos_data = {x for x in pos_data if x != "m/f"}
if isinstance(forms_obj, str) or pos is None:
return base, pos_data

if "adj" in pos:
yield from yield_all_simple_adj_forms(forms_obj, pos_data)
yield from yield_all_simple_adj_forms(forms_obj, pos_data)
content = forms_obj['comparison']
yield content['positive'][0], {"positive"} | pos_data

@@ -556,7 +563,6 @@ def iterate_json(forms_obj, pos_data, base):
else:
yield form, pos_data


elif forms_obj['type'] == 'noun':
columns = forms_obj['columns']
yield from yield_all_noun_forms(forms_obj['cases'], pos_data, columns)
@@ -579,9 +585,9 @@ def iterate_json(forms_obj, pos_data, base):
yield from yield_all_noun_forms(forms_obj, pos_data, ['singular', 'plural'])
return base, pos_data


base_tag_set = {}
INDECLINABLE_POS = {'adverb', 'conjunction', 'preposition', 'interjection', 'particle', 'pronoun', 'numeral'}
INDECLINABLE_POS = {'adverb', 'conjunction', 'preposition', 'interjection', 'particle', 'pronoun', 'numeral'}


class Dictionary(object):
Expand All @@ -603,10 +609,10 @@ def __init__(self, fname, mapping):
forms_obj_array = ujson.loads(forms)

# HOTFIX TIME!
if word_id == "36454":
pos = "adj."
if word_id == "36649":
pos = "f."
pass
if word_id == "6181":
pass

add_tags = [{f"VF-{form_num+1}"} for form_num, _ in enumerate(forms_obj_array)]

@@ -615,6 +621,7 @@ def __init__(self, fname, mapping):

isv_lemmas = isv_lemma.split(",")
if "m./f." in pos:
# example: "6181" "križ","","m./f.","1","cross",
isv_lemmas = [isv_lemma, isv_lemma]
add_tags = [{'masc'}, {'femn'}]
for add_tag, forms_obj, isv_lemma_current in zip(add_tags, forms_obj_array, isv_lemmas):
Expand All @@ -623,7 +630,7 @@ def __init__(self, fname, mapping):
details_set = set(getArr(pos)) | add_tag
# if infer_pos is None, then fallback to the first form
local_pos = infer_pos(details_set) or pos
if local_pos == "noun":
if local_pos == "noun":
details_set |= {'noun'}

if not isinstance(forms_obj, dict):
@@ -669,7 +676,9 @@ def __init__(self, fname, mapping):
if len(all_forms) > 2:
print(isv_lemma_current, all_forms)
raise NameError
all_tags = [{f"V-flex-{form_num+1}"} for form_num, _ in enumerate(all_forms)]
all_tags = [
{f"V-flex-{form_num+1}"} for form_num, _ in enumerate(all_forms)
]

if len(all_forms) == 1:
all_tags = [set()]
Expand All @@ -679,7 +688,9 @@ def __init__(self, fname, mapping):
tags=tag_set | add_tag,
))
if local_pos in {"noun", "numeral"}:
number_forms |= {one_tag for one_tag in tag_set if one_tag in ['singular', 'plural']}
number_forms |= {
one_tag for one_tag in tag_set if one_tag in ['singular', 'plural']
}
if len(number_forms) == 1:
if number_forms != {"singular"} and number_forms != {"plural"}:
print(number_forms, current_lemma.lemma_form.form)