To make randgen tests faster, FROM_ARAB and TO_ARAB are no longer…

… composed together in the romanization round-trip test. Additionally, the `reversible_roman` of `abjad_alphabet` is made analogous to that of Brahmic ISO. This involved removing the ability to accept any string as input. It now expects only Arabic characters, like any Brahmic script ISO. NFC is not needed for TO_ARAB. However, it is needed for Brahmic because of Gurmukhi `SHA` and its NFC form `<SA, NUKTA>`, which apparently have the same romanization defined, unlike Devanagari `QA`. Unlike Brahmic, NFC for Arabic is not idempotent. The current NFC code cannot handle the reordering of a large number of `SHADDA`, `FATHA`, `KASRA`, etc. Even though this is only a theoretical possibility, for randgen tests the number of NFCs applied as part of the round trip should be the same as the number of NFCs applied to the input before verification. PiperOrigin-RevId: 550303033
google-research · Jul 23, 2023 · d5d4242 · d5d4242
1 parent 0e4498c
commit d5d4242
Show file tree

Hide file tree

Showing 8 changed files with 72 additions and 61 deletions.
diff --git a/nisaba/scripts/abjad_alphabet/BUILD.bazel b/nisaba/scripts/abjad_alphabet/BUILD.bazel
@@ -59,18 +59,16 @@ nisaba_compile_multi_grm_py(
         "utf8": "reversible_roman_utf8.far",
     },
     data = [
+        ":nfc.far",
+        ":nfc_utf8.far",
         "//nisaba/scripts/abjad_alphabet/data/Arab:reversible_roman.tsv",
-        "//nisaba/scripts/abjad_alphabet/data/Arab:visual_norm.tsv",
     ],
     visibility = ["//visibility:public"],
     deps = [
         ":util",
-        ":visual_norm_common",
         "//nisaba/scripts/utils:file",
         "//nisaba/scripts/utils:rewrite",
-        "//nisaba/scripts/utils:rule",
         "@org_opengrm_pynini//pynini",
-        "@org_opengrm_pynini//pynini/lib:byte",
     ],
 )
 
@@ -263,13 +261,9 @@ py_test(
     srcs_version = "PY3",
     deps = [
         ":util",
-        "@org_opengrm_pynini//pynini",
+        "//nisaba/scripts/utils:test_util",
         "@io_abseil_py//absl/testing:absltest",
         "@io_abseil_py//absl/testing:parameterized",
-        "//nisaba/scripts/utils:file",
-        # TODO: Remove this dependency by using OpenFar/OpenFstFromFar calls.
-        "//nisaba/scripts/utils:test_util",
-        "@io_abseil_py//absl/logging",
     ],
 )
 
@@ -316,6 +310,7 @@ py_library(
     deps = [
         ":util",
         "//nisaba/scripts/utils:far",
+        "//nisaba/scripts/utils:rewrite",
         "@org_opengrm_pynini//pynini",
     ],
 )

diff --git a/nisaba/scripts/abjad_alphabet/__init__.py b/nisaba/scripts/abjad_alphabet/__init__.py
@@ -14,13 +14,17 @@
 
 """Python APIs for abjad / alphabet grammars."""
 
+# TODO: This library currently only supports `byte` tokens. Consider
+# supporting `utf8` tokens too.
+
 import pathlib
 import re
 import string
 
 import pynini
 from nisaba.scripts.abjad_alphabet import util as u
 from nisaba.scripts.utils import far
+from nisaba.scripts.utils import rewrite
 
 
 class _FarStore(object):
@@ -37,7 +41,9 @@ def __init__(self) -> None:
 
 
 def ToReversibleRoman() -> far.Far.FstWrapper:
-  return _FARS.reversible_roman.Fst('FROM_ARAB')
+  fst = u.open_fst_from_far('reversible_roman', 'FROM_ARAB', 'byte')
+  # Allows out of script characters to pass through.
+  return far.Far.FstWrapper(rewrite.Rewrite(fst))
 
 
 def FromReversibleRoman() -> far.Far.FstWrapper:
@@ -70,7 +76,7 @@ def __init__(self,
       self._nfc = Nfc()
       self._visual_norm = VisualNorm(tag)
     except KeyError as error:
-      raise TagError('Unsupported language/script: {}'.format(error))
+      raise TagError(f'Unsupported language/script: {tag}') from error
     else:
       self.accept_pat = re.compile(r'[^{}]+'.format(re.escape(ignore)))
 

diff --git a/nisaba/scripts/abjad_alphabet/randgen_test.py b/nisaba/scripts/abjad_alphabet/randgen_test.py
@@ -16,35 +16,33 @@
 
 import itertools
 
-from absl import logging  
-
 from absl.testing import absltest
 from absl.testing import parameterized
 from nisaba.scripts.abjad_alphabet import util as u
-from nisaba.scripts.utils import file as uf
-from nisaba.scripts.utils import test_util as ut
+from nisaba.scripts.utils import test_util
 
 
-class FstRandgenTest(parameterized.TestCase, ut.FstRandgenTestCase):
+class FstRandgenTest(parameterized.TestCase, test_util.FstRandgenTestCase):
 
   @parameterized.parameters('byte', 'utf8')
   def test_romanization_roundtrip(self, token_type: str):
-    far = uf.OpenFar(u.FAR_DIR, 'reversible_roman', token_type)
+    nfc = u.open_fst_from_far('nfc', 'ARAB', token_type)
+    far = u.open_far('reversible_roman', token_type)
     natv_to_latin = far['FROM_ARAB']
     latin_to_natv = far['TO_ARAB']
-    round_trip = natv_to_latin @ latin_to_natv
-    self.AssertFstProbablyFunctional(round_trip, token_type)
+    self.AssertFstProbablyIdentity(
+        [natv_to_latin, latin_to_natv], token_type, nfc)
 
   @parameterized.parameters(itertools.product(
-      u.LANGS, ('visual_norm', 'reading_norm'), ('byte', 'utf8')))
-  def test_visual_or_reading_norm(self, lang: str, far_name: str,
+      ('visual_norm', 'reading_norm'), u.LANGS, ('byte', 'utf8')))
+  def test_visual_or_reading_norm(self, far_name: str, lang: str,
                                   token_type: str):
-    fst = uf.OpenFstFromFar(u.FAR_DIR, far_name, token_type, lang)
+    fst = u.open_fst_from_far(far_name, lang, token_type)
     self.AssertFstProbablyFunctional(fst, token_type)
 
   @parameterized.parameters('byte', 'utf8')
   def test_nfc(self, token_type: str):
-    fst = uf.OpenFstFromFar(u.FAR_DIR, 'nfc', token_type, 'ARAB')
+    fst = u.open_fst_from_far('nfc', 'ARAB', token_type)
     self.AssertFstProbablyFunctional(fst, token_type)
 
 

diff --git a/nisaba/scripts/abjad_alphabet/reversible_roman.py b/nisaba/scripts/abjad_alphabet/reversible_roman.py
@@ -30,39 +30,24 @@
 
 import pynini
 from pynini.export import multi_grm
-from pynini.lib import byte
 from nisaba.scripts.abjad_alphabet import util
-from nisaba.scripts.abjad_alphabet import visual_norm_common
 from nisaba.scripts.utils import file
 from nisaba.scripts.utils import rewrite
-from nisaba.scripts.utils import rule
 
 
 def generator_main(exporter_map: multi_grm.ExporterMapping):
   """FSTs for language-agnostic reversible romanization of abjad/alphabets."""
-  # Compile romanisation transducer. In the direction to Latin, NFC and then
-  # visual normalization are applied. They are not required in the opposite
-  # direction.
   for token_type in ('byte', 'utf8'):
     with pynini.default_token_type(token_type):
-      exporter = exporter_map[token_type]
-      sigma = byte.BYTE
-      if token_type == 'utf8':
-        sigma = util.sigma_from_common_data_files()
+      nfc = util.open_fst_from_far('nfc', 'ARAB', token_type)
 
-      # TODO: Currently, `prefix=(‘presentation_forms’,)` takes too long
-      # to process, so it is not specified in `script_common_fsts()` even though
-      # it should be included.
-      script_common_fsts = visual_norm_common.script_common_fsts(sigma)
-      roman_mapping_file = util.LANG_DIR / 'reversible_roman.tsv'
-      roman_fst = rule.fst_from_rule_file(roman_mapping_file, sigma)
-      fsts = script_common_fsts + [roman_fst]
-      exporter['FROM_ARAB'] = rewrite.ComposeFsts(fsts, sigma)
+      roman_tsv = util.LANG_DIR / 'reversible_roman.tsv'
+      roman = file.StringFile(roman_tsv).star.optimize()
 
-      # Transforming Latin to native is simpler.
-      roman_strings = file.StringFile(roman_mapping_file)
-      roman_inv_fst = pynini.invert(roman_strings).star
-      exporter['TO_ARAB'] = roman_inv_fst.optimize()
+      exporter = exporter_map[token_type]
+      # NFC is used for romanization, not de-romanization.
+      exporter['FROM_ARAB'] = rewrite.ComposeFsts([nfc, roman])
+      exporter['TO_ARAB'] = roman.invert()
 
 
 if __name__ == '__main__':

diff --git a/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto b/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto
@@ -18,19 +18,24 @@
 # Currently, test strings are gathered from ALA-LC (Urdu) specification.
 
 # TODO: Add test strings from rest of the languages as well. Out of script
-# characters are pass through.
-rewrite {
-  rule: "FROM_ARAB"
-  input: "Abæ آب"
-  output: "Abæ ʼ͟āb"
-}
+# characters should be pass through.
+
+# rewrite {
+#   rule: "FROM_ARAB"
+#   input: "Abæ آب"
+#   output: "Abæ ʼ͟āb"
+# }
+
+# TODO: Like NFC, visual norm should also be applied before romanization
+# (Urdu U). However, this is not currently done as the build takes too long.
+# Furthermore, Brahmic scripts do not apply visual norm before ISO.
+
+# rewrite {
+#   rule: "FROM_ARAB"
+#   input: "عضوُ"
+#   output: "ʻẓʉ"
+# }
 
-# Visual normalization applied prior to romanization (Urdu U).
-rewrite {
-  rule: "FROM_ARAB"
-  input: "عضوُ"
-  output: "ʻẓʉ"
-}
 rewrite {
   rule: "FROM_ARAB"
   input: "عضۇ"
@@ -42,7 +47,7 @@ rewrite {
   output: ""
 }
 
-# NFC + visual normalization applied prior to romanization.
+# NFC applied prior to romanization.
 rewrite {
   rule: "FROM_ARAB"
   input: "آپ"

diff --git a/nisaba/scripts/abjad_alphabet/util.py b/nisaba/scripts/abjad_alphabet/util.py
@@ -50,6 +50,11 @@ def sigma_from_common_data_files() -> pynini.Fst:
   return uc.derive_sigma(chars)
 
 
+def open_far(far_name: str, token_type: str) -> pynini.Far:
+  """Loads Abjad-Alphabet FAR specified by `far_name`."""
+  return uf.OpenFar(FAR_DIR, far_name, token_type)
+
+
 def open_fst_from_far(far_name: str, fst_name: str,
                       token_type: str) -> pynini.Fst:
   """Loads FST given by `fst_name` from FAR specified by `far_name`."""

diff --git a/nisaba/scripts/brahmic/iso.py b/nisaba/scripts/brahmic/iso.py
@@ -161,6 +161,13 @@ def _script_fsts(script: str, token_type: str) -> Tuple[p.Fst, p.Fst]:
   # out.
   nfc = u.OpenFstFromBrahmicFar('nfc', script, token_type)
   from_nfced_script = rw.ComposeFsts([nfc, from_script])
+
+  # TODO: The NFC form of Gurmukhi SHA is <SA, NUKTA>, which currently has
+  # the same romanization defined in the Guru/consonant. So NFC on TO_GURU is
+  # required currently. However that need not be the case. We could consider
+  # moving the SHA from common consonant mapping to script specific files
+  # without adding that to Gurmukhi. That would then align this code with
+  # Arabic, which does not do NFC on TO_ARAB.
   to_nfced_script = rw.ComposeFsts([to_script, nfc])
   return (from_nfced_script, to_nfced_script)
 
@@ -177,6 +184,9 @@ def generator_main(exporter_map: multi_grm.ExporterMapping):
         script = script.upper()
         exporter[f'FROM_{script}'] = from_script
         exporter[f'TO_{script}'] = to_script
+      # TODO: Following rewrite assumes 'byte' token type. It should be
+      # made available to 'utf8' as well. The corresponding 'utf8_test' is
+      # missing as well.
       exporter['FROM_BRAHMIC'] = rw.Rewrite(p.union(*from_script_fsts))
 
 

diff --git a/nisaba/scripts/utils/test_util.py b/nisaba/scripts/utils/test_util.py
@@ -284,10 +284,17 @@ def _AssertFstSampledBehavior(
     with pynini.default_token_type(token_type):
       for ilabels in _OlabelsIter(input_samples):
         input_str_fsa = _LabelListToStringFsa(ilabels)
+        output_str = rewrite.ComposeFsts([input_str_fsa] + fsts)
+
+        # Please note that `norm` is not idempotent if it is Arabic NFC which
+        # cannot handle the reordering of a large number of SHADDA, FATHA,
+        # FATHATAN, KASRA, etc.. Even though this is only a theoretical
+        # possibility, for randgen test, the number of NFCs in the round trip
+        # should be the same as the count of NFCs applied to the input before
+        # the assert function.
         if norm_fst:
-          input_str_fsa @= norm_fst
-        output_fst = rewrite.ComposeFsts([input_str_fsa] + fsts)
-        assert_function(input_str_fsa, output_fst)
+          input_str_fsa = rewrite.ComposeFsts([input_str_fsa, norm_fst])
+        assert_function(input_str_fsa, output_str)
 
 
 class FstTestCase(absltest.TestCase):