From d5d42422bf52cc2f71bf839a89e562b59b4d7242 Mon Sep 17 00:00:00 2001 From: Cibu Johny Date: Sun, 23 Jul 2023 00:44:21 -0700 Subject: [PATCH] To make randgen tests faster, `FROM_ARAB` and `TO_ARAB` are no longer composed together in the romanization round-trip test. Additionally, the `reversible_roman` of `abjad_alphabet` is made analogous to that of Brahmic ISO. This involved removing the ability to accept any string as input. It now expects only Arabic characters, like any Brahmic script ISO. NFC is not needed for `TO_ARAB`. However, it is needed for Brahmic, because Gurmukhi `SHA` and its NFC form `<SA, NUKTA>` apparently have the same romanization defined, unlike Devanagari `QA`. Unlike Brahmic, NFC for Arabic is not idempotent: the current NFC code cannot handle the reordering of a large number of `SHADDA`, `FATHA`, `KASRA`, etc. Even though this is only a theoretical possibility, for randgen tests the number of NFCs performed as part of the round trip should be the same as the number of NFCs applied to the input before verification. 
PiperOrigin-RevId: 550303033 --- nisaba/scripts/abjad_alphabet/BUILD.bazel | 13 +++----- nisaba/scripts/abjad_alphabet/__init__.py | 10 ++++-- nisaba/scripts/abjad_alphabet/randgen_test.py | 22 ++++++------- .../abjad_alphabet/reversible_roman.py | 29 +++++------------ .../testdata/reversible_roman.textproto | 31 +++++++++++-------- nisaba/scripts/abjad_alphabet/util.py | 5 +++ nisaba/scripts/brahmic/iso.py | 10 ++++++ nisaba/scripts/utils/test_util.py | 13 ++++++-- 8 files changed, 72 insertions(+), 61 deletions(-) diff --git a/nisaba/scripts/abjad_alphabet/BUILD.bazel b/nisaba/scripts/abjad_alphabet/BUILD.bazel index c1f44c80..593e3757 100644 --- a/nisaba/scripts/abjad_alphabet/BUILD.bazel +++ b/nisaba/scripts/abjad_alphabet/BUILD.bazel @@ -59,18 +59,16 @@ nisaba_compile_multi_grm_py( "utf8": "reversible_roman_utf8.far", }, data = [ + ":nfc.far", + ":nfc_utf8.far", "//nisaba/scripts/abjad_alphabet/data/Arab:reversible_roman.tsv", - "//nisaba/scripts/abjad_alphabet/data/Arab:visual_norm.tsv", ], visibility = ["//visibility:public"], deps = [ ":util", - ":visual_norm_common", "//nisaba/scripts/utils:file", "//nisaba/scripts/utils:rewrite", - "//nisaba/scripts/utils:rule", "@org_opengrm_pynini//pynini", - "@org_opengrm_pynini//pynini/lib:byte", ], ) @@ -263,13 +261,9 @@ py_test( srcs_version = "PY3", deps = [ ":util", - "@org_opengrm_pynini//pynini", + "//nisaba/scripts/utils:test_util", "@io_abseil_py//absl/testing:absltest", "@io_abseil_py//absl/testing:parameterized", - "//nisaba/scripts/utils:file", - # TODO: Remove this dependency by using OpenFar/OpenFstFromFar calls. 
- "//nisaba/scripts/utils:test_util", - "@io_abseil_py//absl/logging", ], ) @@ -316,6 +310,7 @@ py_library( deps = [ ":util", "//nisaba/scripts/utils:far", + "//nisaba/scripts/utils:rewrite", "@org_opengrm_pynini//pynini", ], ) diff --git a/nisaba/scripts/abjad_alphabet/__init__.py b/nisaba/scripts/abjad_alphabet/__init__.py index 0f93d3d0..eca80ed9 100644 --- a/nisaba/scripts/abjad_alphabet/__init__.py +++ b/nisaba/scripts/abjad_alphabet/__init__.py @@ -14,6 +14,9 @@ """Python APIs for abjad / alphabet grammars.""" +# TODO: This library currently only supports `byte` tokens. Consider +# supporting `utf8` tokens too. + import pathlib import re import string @@ -21,6 +24,7 @@ import pynini from nisaba.scripts.abjad_alphabet import util as u from nisaba.scripts.utils import far +from nisaba.scripts.utils import rewrite class _FarStore(object): @@ -37,7 +41,9 @@ def __init__(self) -> None: def ToReversibleRoman() -> far.Far.FstWrapper: - return _FARS.reversible_roman.Fst('FROM_ARAB') + fst = u.open_fst_from_far('reversible_roman', 'FROM_ARAB', 'byte') + # Allows out of script characters to pass through. 
+ return far.Far.FstWrapper(rewrite.Rewrite(fst)) def FromReversibleRoman() -> far.Far.FstWrapper: @@ -70,7 +76,7 @@ def __init__(self, self._nfc = Nfc() self._visual_norm = VisualNorm(tag) except KeyError as error: - raise TagError('Unsupported language/script: {}'.format(error)) + raise TagError(f'Unsupported language/script: {tag}') from error else: self.accept_pat = re.compile(r'[^{}]+'.format(re.escape(ignore))) diff --git a/nisaba/scripts/abjad_alphabet/randgen_test.py b/nisaba/scripts/abjad_alphabet/randgen_test.py index e5cba46b..9091e323 100644 --- a/nisaba/scripts/abjad_alphabet/randgen_test.py +++ b/nisaba/scripts/abjad_alphabet/randgen_test.py @@ -16,35 +16,33 @@ import itertools -from absl import logging - from absl.testing import absltest from absl.testing import parameterized from nisaba.scripts.abjad_alphabet import util as u -from nisaba.scripts.utils import file as uf -from nisaba.scripts.utils import test_util as ut +from nisaba.scripts.utils import test_util -class FstRandgenTest(parameterized.TestCase, ut.FstRandgenTestCase): +class FstRandgenTest(parameterized.TestCase, test_util.FstRandgenTestCase): @parameterized.parameters('byte', 'utf8') def test_romanization_roundtrip(self, token_type: str): - far = uf.OpenFar(u.FAR_DIR, 'reversible_roman', token_type) + nfc = u.open_fst_from_far('nfc', 'ARAB', token_type) + far = u.open_far('reversible_roman', token_type) natv_to_latin = far['FROM_ARAB'] latin_to_natv = far['TO_ARAB'] - round_trip = natv_to_latin @ latin_to_natv - self.AssertFstProbablyFunctional(round_trip, token_type) + self.AssertFstProbablyIdentity( + [natv_to_latin, latin_to_natv], token_type, nfc) @parameterized.parameters(itertools.product( - u.LANGS, ('visual_norm', 'reading_norm'), ('byte', 'utf8'))) - def test_visual_or_reading_norm(self, lang: str, far_name: str, + ('visual_norm', 'reading_norm'), u.LANGS, ('byte', 'utf8'))) + def test_visual_or_reading_norm(self, far_name: str, lang: str, token_type: str): - fst = 
uf.OpenFstFromFar(u.FAR_DIR, far_name, token_type, lang) + fst = u.open_fst_from_far(far_name, lang, token_type) self.AssertFstProbablyFunctional(fst, token_type) @parameterized.parameters('byte', 'utf8') def test_nfc(self, token_type: str): - fst = uf.OpenFstFromFar(u.FAR_DIR, 'nfc', token_type, 'ARAB') + fst = u.open_fst_from_far('nfc', 'ARAB', token_type) self.AssertFstProbablyFunctional(fst, token_type) diff --git a/nisaba/scripts/abjad_alphabet/reversible_roman.py b/nisaba/scripts/abjad_alphabet/reversible_roman.py index 0f1d29bd..d6006a20 100644 --- a/nisaba/scripts/abjad_alphabet/reversible_roman.py +++ b/nisaba/scripts/abjad_alphabet/reversible_roman.py @@ -30,39 +30,24 @@ import pynini from pynini.export import multi_grm -from pynini.lib import byte from nisaba.scripts.abjad_alphabet import util -from nisaba.scripts.abjad_alphabet import visual_norm_common from nisaba.scripts.utils import file from nisaba.scripts.utils import rewrite -from nisaba.scripts.utils import rule def generator_main(exporter_map: multi_grm.ExporterMapping): """FSTs for language-agnostic reversible romanization of abjad/alphabets.""" - # Compile romanisation transducer. In the direction to Latin, NFC and then - # visual normalization are applied. They are not required in the opposite - # direction. for token_type in ('byte', 'utf8'): with pynini.default_token_type(token_type): - exporter = exporter_map[token_type] - sigma = byte.BYTE - if token_type == 'utf8': - sigma = util.sigma_from_common_data_files() + nfc = util.open_fst_from_far('nfc', 'ARAB', token_type) - # TODO: Currently, `prefix=(‘presentation_forms’,)` takes too long - # to process, so it is not specified in `script_common_fsts()` even though - # it should be included. 
- script_common_fsts = visual_norm_common.script_common_fsts(sigma) - roman_mapping_file = util.LANG_DIR / 'reversible_roman.tsv' - roman_fst = rule.fst_from_rule_file(roman_mapping_file, sigma) - fsts = script_common_fsts + [roman_fst] - exporter['FROM_ARAB'] = rewrite.ComposeFsts(fsts, sigma) + roman_tsv = util.LANG_DIR / 'reversible_roman.tsv' + roman = file.StringFile(roman_tsv).star.optimize() - # Transforming Latin to native is simpler. - roman_strings = file.StringFile(roman_mapping_file) - roman_inv_fst = pynini.invert(roman_strings).star - exporter['TO_ARAB'] = roman_inv_fst.optimize() + exporter = exporter_map[token_type] + # NFC is used for romanization, not de-romanization. + exporter['FROM_ARAB'] = rewrite.ComposeFsts([nfc, roman]) + exporter['TO_ARAB'] = roman.invert() if __name__ == '__main__': diff --git a/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto b/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto index e8081b4b..b8415f05 100644 --- a/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto +++ b/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto @@ -18,19 +18,24 @@ # Currently, test strings are gathered from ALA-LC (Urdu) specification. # TODO: Add test strings from rest of the languages as well. Out of script -# characters are pass through. -rewrite { - rule: "FROM_ARAB" - input: "Abæ آب" - output: "Abæ ʼ͟āb" -} +# characters should be pass through. + +# rewrite { +# rule: "FROM_ARAB" +# input: "Abæ آب" +# output: "Abæ ʼ͟āb" +# } + +# TODO: Like NFC, visual norm should also be applied before romanization +# (Urdu U). However, this is not currently done as the build takes too long. +# Furthermore, Brahmic scripts do not apply visual norm before ISO. + +# rewrite { +# rule: "FROM_ARAB" +# input: "عضوُ" +# output: "ʻẓʉ" +# } -# Visual normalization applied prior to romanization (Urdu U). 
-rewrite { - rule: "FROM_ARAB" - input: "عضوُ" - output: "ʻẓʉ" -} rewrite { rule: "FROM_ARAB" input: "عضۇ" @@ -42,7 +47,7 @@ rewrite { output: "" } -# NFC + visual normalization applied prior to romanization. +# NFC applied prior to romanization. rewrite { rule: "FROM_ARAB" input: "آپ" diff --git a/nisaba/scripts/abjad_alphabet/util.py b/nisaba/scripts/abjad_alphabet/util.py index 6e51f94f..b6acacee 100644 --- a/nisaba/scripts/abjad_alphabet/util.py +++ b/nisaba/scripts/abjad_alphabet/util.py @@ -50,6 +50,11 @@ def sigma_from_common_data_files() -> pynini.Fst: return uc.derive_sigma(chars) +def open_far(far_name: str, token_type: str) -> pynini.Far: + """Loads Abjad-Alphabet FAR specified by `far_name`.""" + return uf.OpenFar(FAR_DIR, far_name, token_type) + + def open_fst_from_far(far_name: str, fst_name: str, token_type: str) -> pynini.Fst: """Loads FST given by `fst_name` from FAR specified by `far_name`.""" diff --git a/nisaba/scripts/brahmic/iso.py b/nisaba/scripts/brahmic/iso.py index ea5f9119..1db83f36 100644 --- a/nisaba/scripts/brahmic/iso.py +++ b/nisaba/scripts/brahmic/iso.py @@ -161,6 +161,13 @@ def _script_fsts(script: str, token_type: str) -> Tuple[p.Fst, p.Fst]: # out. nfc = u.OpenFstFromBrahmicFar('nfc', script, token_type) from_nfced_script = rw.ComposeFsts([nfc, from_script]) + + # TODO: The NFC form of Gurmukhi SHA is <SA, NUKTA>, which currently has + # the same romanization defined in the Guru/consonant. So NFC on TO_GURU is + # required currently. However that need not be the case. We could consider + # moving the SHA from common consonant mapping to script specific files + # without adding that to Gurmukhi. That would then align this code with + # Arabic, which does not do NFC on TO_ARAB. 
to_nfced_script = rw.ComposeFsts([to_script, nfc]) return (from_nfced_script, to_nfced_script) @@ -177,6 +184,9 @@ def generator_main(exporter_map: multi_grm.ExporterMapping): script = script.upper() exporter[f'FROM_{script}'] = from_script exporter[f'TO_{script}'] = to_script + # TODO: Following rewrite assumes 'byte' token type. It should be + # made available to 'utf8' as well. The corresponding 'utf8_test' is + # missing as well. exporter['FROM_BRAHMIC'] = rw.Rewrite(p.union(*from_script_fsts)) diff --git a/nisaba/scripts/utils/test_util.py b/nisaba/scripts/utils/test_util.py index 9cc2df9c..0a12253f 100644 --- a/nisaba/scripts/utils/test_util.py +++ b/nisaba/scripts/utils/test_util.py @@ -284,10 +284,17 @@ def _AssertFstSampledBehavior( with pynini.default_token_type(token_type): for ilabels in _OlabelsIter(input_samples): input_str_fsa = _LabelListToStringFsa(ilabels) + output_str = rewrite.ComposeFsts([input_str_fsa] + fsts) + + # Please note that `norm_fst` is not idempotent if it is Arabic NFC which + # cannot handle the reordering of a large number of SHADDA, FATHA, + # FATHATAN, KASRA, etc. Even though this is only a theoretical + # possibility, for the randgen test, the number of NFCs in the round trip + # should be the same as the count of NFCs applied to the input before + # the assert function. if norm_fst: - input_str_fsa @= norm_fst - output_fst = rewrite.ComposeFsts([input_str_fsa] + fsts) - assert_function(input_str_fsa, output_fst) + input_str_fsa = rewrite.ComposeFsts([input_str_fsa, norm_fst]) + assert_function(input_str_fsa, output_str) class FstTestCase(absltest.TestCase):