Skip to content

Commit

Permalink
To make randgen tests faster, FROM_ARAB and TO_ARAB are no longer…
Browse files Browse the repository at this point in the history
… composed together in the romanization round-trip test.

Additionally, the `reversible_roman` of `abjad_alphabet` is made analogous to that of Brahmic ISO. This involved removing the ability to accept any string as input. It now expects only Arabic characters, like any Brahmic script ISO.

NFC is not needed for TO_ARAB. However, it is needed for Brahmic, because Gurmukhi `SHA` and its NFC form `<SA, NUKTA>` apparently have the same romanization defined, unlike Devanagari `QA`.

Unlike Brahmic, NFC for Arabic is not idempotent. The current NFC code cannot handle the reordering of a large number of `SHADDA`, `FATHA`, `KASRA`, etc. Even though this is only a theoretical possibility, for randgen tests the number of NFCs we do as part of the round trip should be the same as the number of NFCs applied to the input before verification.

PiperOrigin-RevId: 550303033
  • Loading branch information
cibu authored and copybara-github committed Jul 23, 2023
1 parent 0e4498c commit d5d4242
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 61 deletions.
13 changes: 4 additions & 9 deletions nisaba/scripts/abjad_alphabet/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,16 @@ nisaba_compile_multi_grm_py(
"utf8": "reversible_roman_utf8.far",
},
data = [
":nfc.far",
":nfc_utf8.far",
"//nisaba/scripts/abjad_alphabet/data/Arab:reversible_roman.tsv",
"//nisaba/scripts/abjad_alphabet/data/Arab:visual_norm.tsv",
],
visibility = ["//visibility:public"],
deps = [
":util",
":visual_norm_common",
"//nisaba/scripts/utils:file",
"//nisaba/scripts/utils:rewrite",
"//nisaba/scripts/utils:rule",
"@org_opengrm_pynini//pynini",
"@org_opengrm_pynini//pynini/lib:byte",
],
)

Expand Down Expand Up @@ -263,13 +261,9 @@ py_test(
srcs_version = "PY3",
deps = [
":util",
"@org_opengrm_pynini//pynini",
"//nisaba/scripts/utils:test_util",
"@io_abseil_py//absl/testing:absltest",
"@io_abseil_py//absl/testing:parameterized",
"//nisaba/scripts/utils:file",
# TODO: Remove this dependency by using OpenFar/OpenFstFromFar calls.
"//nisaba/scripts/utils:test_util",
"@io_abseil_py//absl/logging",
],
)

Expand Down Expand Up @@ -316,6 +310,7 @@ py_library(
deps = [
":util",
"//nisaba/scripts/utils:far",
"//nisaba/scripts/utils:rewrite",
"@org_opengrm_pynini//pynini",
],
)
Expand Down
10 changes: 8 additions & 2 deletions nisaba/scripts/abjad_alphabet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@

"""Python APIs for abjad / alphabet grammars."""

# TODO: This library currently only supports `byte` tokens. Consider
# supporting `utf8` tokens too.

import pathlib
import re
import string

import pynini
from nisaba.scripts.abjad_alphabet import util as u
from nisaba.scripts.utils import far
from nisaba.scripts.utils import rewrite


class _FarStore(object):
Expand All @@ -37,7 +41,9 @@ def __init__(self) -> None:


def ToReversibleRoman() -> far.Far.FstWrapper:
return _FARS.reversible_roman.Fst('FROM_ARAB')
fst = u.open_fst_from_far('reversible_roman', 'FROM_ARAB', 'byte')
# Allows out of script characters to pass through.
return far.Far.FstWrapper(rewrite.Rewrite(fst))


def FromReversibleRoman() -> far.Far.FstWrapper:
Expand Down Expand Up @@ -70,7 +76,7 @@ def __init__(self,
self._nfc = Nfc()
self._visual_norm = VisualNorm(tag)
except KeyError as error:
raise TagError('Unsupported language/script: {}'.format(error))
raise TagError(f'Unsupported language/script: {tag}') from error
else:
self.accept_pat = re.compile(r'[^{}]+'.format(re.escape(ignore)))

Expand Down
22 changes: 10 additions & 12 deletions nisaba/scripts/abjad_alphabet/randgen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,33 @@

import itertools

from absl import logging

from absl.testing import absltest
from absl.testing import parameterized
from nisaba.scripts.abjad_alphabet import util as u
from nisaba.scripts.utils import file as uf
from nisaba.scripts.utils import test_util as ut
from nisaba.scripts.utils import test_util


class FstRandgenTest(parameterized.TestCase, ut.FstRandgenTestCase):
class FstRandgenTest(parameterized.TestCase, test_util.FstRandgenTestCase):

@parameterized.parameters('byte', 'utf8')
def test_romanization_roundtrip(self, token_type: str):
far = uf.OpenFar(u.FAR_DIR, 'reversible_roman', token_type)
nfc = u.open_fst_from_far('nfc', 'ARAB', token_type)
far = u.open_far('reversible_roman', token_type)
natv_to_latin = far['FROM_ARAB']
latin_to_natv = far['TO_ARAB']
round_trip = natv_to_latin @ latin_to_natv
self.AssertFstProbablyFunctional(round_trip, token_type)
self.AssertFstProbablyIdentity(
[natv_to_latin, latin_to_natv], token_type, nfc)

@parameterized.parameters(itertools.product(
u.LANGS, ('visual_norm', 'reading_norm'), ('byte', 'utf8')))
def test_visual_or_reading_norm(self, lang: str, far_name: str,
('visual_norm', 'reading_norm'), u.LANGS, ('byte', 'utf8')))
def test_visual_or_reading_norm(self, far_name: str, lang: str,
token_type: str):
fst = uf.OpenFstFromFar(u.FAR_DIR, far_name, token_type, lang)
fst = u.open_fst_from_far(far_name, lang, token_type)
self.AssertFstProbablyFunctional(fst, token_type)

@parameterized.parameters('byte', 'utf8')
def test_nfc(self, token_type: str):
fst = uf.OpenFstFromFar(u.FAR_DIR, 'nfc', token_type, 'ARAB')
fst = u.open_fst_from_far('nfc', 'ARAB', token_type)
self.AssertFstProbablyFunctional(fst, token_type)


Expand Down
29 changes: 7 additions & 22 deletions nisaba/scripts/abjad_alphabet/reversible_roman.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,39 +30,24 @@

import pynini
from pynini.export import multi_grm
from pynini.lib import byte
from nisaba.scripts.abjad_alphabet import util
from nisaba.scripts.abjad_alphabet import visual_norm_common
from nisaba.scripts.utils import file
from nisaba.scripts.utils import rewrite
from nisaba.scripts.utils import rule


def generator_main(exporter_map: multi_grm.ExporterMapping):
"""FSTs for language-agnostic reversible romanization of abjad/alphabets."""
# Compile romanisation transducer. In the direction to Latin, NFC and then
# visual normalization are applied. They are not required in the opposite
# direction.
for token_type in ('byte', 'utf8'):
with pynini.default_token_type(token_type):
exporter = exporter_map[token_type]
sigma = byte.BYTE
if token_type == 'utf8':
sigma = util.sigma_from_common_data_files()
nfc = util.open_fst_from_far('nfc', 'ARAB', token_type)

# TODO: Currently, `prefix=('presentation_forms',)` takes too long
# to process, so it is not specified in `script_common_fsts()` even though
# it should be included.
script_common_fsts = visual_norm_common.script_common_fsts(sigma)
roman_mapping_file = util.LANG_DIR / 'reversible_roman.tsv'
roman_fst = rule.fst_from_rule_file(roman_mapping_file, sigma)
fsts = script_common_fsts + [roman_fst]
exporter['FROM_ARAB'] = rewrite.ComposeFsts(fsts, sigma)
roman_tsv = util.LANG_DIR / 'reversible_roman.tsv'
roman = file.StringFile(roman_tsv).star.optimize()

# Transforming Latin to native is simpler.
roman_strings = file.StringFile(roman_mapping_file)
roman_inv_fst = pynini.invert(roman_strings).star
exporter['TO_ARAB'] = roman_inv_fst.optimize()
exporter = exporter_map[token_type]
# NFC is used for romanization, not de-romanization.
exporter['FROM_ARAB'] = rewrite.ComposeFsts([nfc, roman])
exporter['TO_ARAB'] = roman.invert()


if __name__ == '__main__':
Expand Down
31 changes: 18 additions & 13 deletions nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,24 @@
# Currently, test strings are gathered from ALA-LC (Urdu) specification.

# TODO: Add test strings from rest of the languages as well. Out of script
# characters are pass through.
rewrite {
rule: "FROM_ARAB"
input: "Abæ آب"
output: "Abæ ʼ͟āb"
}
# characters should be passed through.

# rewrite {
# rule: "FROM_ARAB"
# input: "Abæ آب"
# output: "Abæ ʼ͟āb"
# }

# TODO: Like NFC, visual norm should also be applied before romanization
# (Urdu U). However, this is not currently done as the build takes too long.
# Furthermore, Brahmic scripts do not apply visual norm before ISO.

# rewrite {
# rule: "FROM_ARAB"
# input: "عضوُ"
# output: "ʻẓʉ"
# }

# Visual normalization applied prior to romanization (Urdu U).
rewrite {
rule: "FROM_ARAB"
input: "عضوُ"
output: "ʻẓʉ"
}
rewrite {
rule: "FROM_ARAB"
input: "عضۇ"
Expand All @@ -42,7 +47,7 @@ rewrite {
output: ""
}

# NFC + visual normalization applied prior to romanization.
# NFC applied prior to romanization.
rewrite {
rule: "FROM_ARAB"
input: "آپ"
Expand Down
5 changes: 5 additions & 0 deletions nisaba/scripts/abjad_alphabet/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ def sigma_from_common_data_files() -> pynini.Fst:
return uc.derive_sigma(chars)


def open_far(far_name: str, token_type: str) -> pynini.Far:
  """Opens the abjad / alphabet FAR called `far_name`.

  Args:
    far_name: Name of the FAR to open, e.g. 'reversible_roman'.
    token_type: Token type of the FAR to open, e.g. 'byte' or 'utf8'.

  Returns:
    Whatever `uf.OpenFar` returns for `FAR_DIR`, `far_name` and `token_type`.
  """
  far = uf.OpenFar(FAR_DIR, far_name, token_type)
  return far


def open_fst_from_far(far_name: str, fst_name: str,
token_type: str) -> pynini.Fst:
"""Loads FST given by `fst_name` from FAR specified by `far_name`."""
Expand Down
10 changes: 10 additions & 0 deletions nisaba/scripts/brahmic/iso.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,13 @@ def _script_fsts(script: str, token_type: str) -> Tuple[p.Fst, p.Fst]:
# out.
nfc = u.OpenFstFromBrahmicFar('nfc', script, token_type)
from_nfced_script = rw.ComposeFsts([nfc, from_script])

# TODO: The NFC form of Gurmukhi SHA is <SA, NUKTA>, which currently has
# the same romanization defined in the Guru/consonant. So NFC on TO_GURU is
# currently required. However, that need not be the case. We could consider
# moving the SHA from the common consonant mapping to script-specific files
# without adding it to Gurmukhi. That would then align this code with
# Arabic, which does not do NFC on TO_ARAB.
to_nfced_script = rw.ComposeFsts([to_script, nfc])
return (from_nfced_script, to_nfced_script)

Expand All @@ -177,6 +184,9 @@ def generator_main(exporter_map: multi_grm.ExporterMapping):
script = script.upper()
exporter[f'FROM_{script}'] = from_script
exporter[f'TO_{script}'] = to_script
# TODO: The following rewrite assumes the 'byte' token type. It should be
# made available for 'utf8' as well. The corresponding 'utf8_test' is
# also missing.
exporter['FROM_BRAHMIC'] = rw.Rewrite(p.union(*from_script_fsts))


Expand Down
13 changes: 10 additions & 3 deletions nisaba/scripts/utils/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,17 @@ def _AssertFstSampledBehavior(
with pynini.default_token_type(token_type):
for ilabels in _OlabelsIter(input_samples):
input_str_fsa = _LabelListToStringFsa(ilabels)
output_str = rewrite.ComposeFsts([input_str_fsa] + fsts)

# Please note that `norm_fst` is not idempotent if it is the Arabic NFC, which
# cannot handle the reordering of a large number of SHADDA, FATHA, FATHATAN,
# KASRA, etc. Even though this is only a theoretical possibility, for randgen
# tests the number of NFCs in the round trip should be the same as the count
# of NFCs applied to the input before the assert function.
if norm_fst:
input_str_fsa @= norm_fst
output_fst = rewrite.ComposeFsts([input_str_fsa] + fsts)
assert_function(input_str_fsa, output_fst)
input_str_fsa = rewrite.ComposeFsts([input_str_fsa, norm_fst])
assert_function(input_str_fsa, output_str)


class FstTestCase(absltest.TestCase):
Expand Down

0 comments on commit d5d4242

Please sign in to comment.