google-research
diff --git a/‎nisaba/scripts/natural_translit/brahmic/en_spellout.py
+11-11 b/‎nisaba/scripts/natural_translit/brahmic/en_spellout.py
+11-11
diff --git a/‎nisaba/scripts/natural_translit/deromanization/BUILD.bazel
+22-47 b/‎nisaba/scripts/natural_translit/deromanization/BUILD.bazel
+22-47
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout.py
+50 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout.py
+50
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout_bn_beng.py
-30 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout_bn_beng.py
-30
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout_gu_gujr.py
-30 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout_gu_gujr.py
-30
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout_hi_deva.py
-30 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout_hi_deva.py
-30
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout_kn_knda.py
-30 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout_kn_knda.py
-30
diff --git a/‎nisaba/scripts/natural_translit/deromanization/en_spellout_ml_mlym.py
-30 b/‎nisaba/scripts/natural_translit/deromanization/en_spellout_ml_mlym.py
-30
@@ -196,7 +196,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਫ਼',  # pa_guru
           'ايف',  # sd_arab
           'එෆ්',  # si_sinh
-          'எஃப்‌',  # ta_taml
+          'எஃப்',  # ta_taml
           'ఎఫ్‌',  # te_telu
           'ایف',  # ur_arab
       ),
@@ -228,7 +228,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਚ',  # pa_guru
           'ايڇ',  # sd_arab
           'එච්',  # si_sinh
-          'ஹெச்‌',  # ta_taml
+          'ஹெச்',  # ta_taml
           'హెచ్‌',  # te_telu
           'ایچ',  # ur_arab
       ),
@@ -292,7 +292,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਲ',  # pa_guru
           'ايل',  # sd_arab
           'එල්',  # si_sinh
-          'எல்‌',  # ta_taml
+          'எல்',  # ta_taml
           'ఎల్‌',  # te_telu
           'ایل',  # ur_arab
       ),
@@ -308,7 +308,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਮ',  # pa_guru
           'ايم',  # sd_arab
           'එම්',  # si_sinh
-          'எம்‌',  # ta_taml
+          'எம்',  # ta_taml
           'ఎం',  # te_telu
           'ایم',  # ur_arab
       ),
@@ -324,7 +324,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਨ',  # pa_guru
           'اين',  # sd_arab
           'එන්',  # si_sinh
-          'என்‌',  # ta_taml
+          'என்',  # ta_taml
           'ఎన్‌',  # te_telu
           'این',  # ur_arab
       ),
@@ -388,7 +388,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਆਰ',  # pa_guru
           'آر',  # sd_arab
           'ආර්',  # si_sinh
-          'ஆர்‌',  # ta_taml
+          'ஆர்',  # ta_taml
           'ఆర్‌',  # te_telu
           'آر',  # ur_arab
       ),
@@ -404,7 +404,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਸ',  # pa_guru
           'ايس',  # sd_arab
           'එස්',  # si_sinh
-          'எஸ்‌',  # ta_taml
+          'எஸ்',  # ta_taml
           'ఎస్‌',  # te_telu
           'ایس',  # ur_arab
       ),
@@ -484,7 +484,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਐੱਕਸ',  # pa_guru
           'ايڪس',  # sd_arab
           'එක්ස්',  # si_sinh
-          'எக்ஸ்‌',  # ta_taml
+          'எக்ஸ்',  # ta_taml
           'ఎక్స్‌',  # te_telu
           'ایکس',  # ur_arab
       ),
@@ -500,7 +500,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਵਾਈ',  # pa_guru
           'وائي',  # sd_arab
           'වයි',  # si_sinh
-          'ஒய்‌',  # ta_taml
+          'ஒய்',  # ta_taml
           'వై',  # te_telu
           'وائے',  # ur_arab
       ),
@@ -516,7 +516,7 @@ def _spellout_inventory() -> i.Inventory:
           'ਜ਼ੈਡ',  # pa_guru
           'زيڊ',  # sd_arab
           'සඩ්',  # si_sinh
-          'இஜட்‌',  # ta_taml
+          'இஜட்',  # ta_taml
           'జడ్‌',  # te_telu
           'زیڈ',  # ur_arab
       ),
@@ -556,7 +556,7 @@ def _separated(
 
 def speller(language: Language, script: Script) -> pyn.Fst:
   """Builds a speller FST for the given language and script."""
-  has_zwnj = [Script.KNDA, Script.MLYM, Script.TAML, Script.TELU]
+  has_zwnj = [Script.KNDA, Script.MLYM, Script.TELU]
   spellouts = i.Inventory.from_list([
       ty.Thing(
           letter.letter, value_from=_get_spellout(letter, language, script)
 
@@ -35,22 +35,6 @@ LANGUAGES = [
     ("ta", "taml"),
 ]
 
-SPELLOUT_LANGUAGES = [
-    "bn_beng",
-    "gu_gujr",
-    "hi_deva",
-    "kn_knda",
-    "ml_mlym",
-    "mr_deva",
-    "or_orya",
-    "pa_guru",
-    "sd_arab",
-    "si_sinh",
-    "ta_taml",
-    "te_telu",
-    "ur_arab",
-]
-
 # Compile ISO grms. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso
 
 [
@@ -87,22 +71,6 @@ SPELLOUT_LANGUAGES = [
     for language, script in LANGUAGES
 ]
 
-[
-    nisaba_compile_multi_grm_py(
-        name = "en_spellout_%s" % language,
-        outs = {
-            "byte": "en_spellout_%s.far" % language,
-            "utf8": "en_spellout_%s_utf8.far" % language,
-        },
-        visibility = ["//visibility:public"],
-        deps = [
-            "//nisaba/scripts/natural_translit/brahmic:en_spellout",
-            "@org_opengrm_pynini//pynini",
-        ],
-    )
-    for language in SPELLOUT_LANGUAGES
-]
-
 # Byte tests. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso_test
 
 [
@@ -119,13 +87,6 @@ SPELLOUT_LANGUAGES = [
     for language, script in LANGUAGES
 ]
 
-[
-    nisaba_grm_textproto_test(
-        name = "en_spellout_%s_test" % language,
-    )
-    for language in SPELLOUT_LANGUAGES
-]
-
 # UTF-8 tests. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso_utf8_test
 [
     nisaba_grm_textproto_test(
@@ -145,11 +106,25 @@ SPELLOUT_LANGUAGES = [
     for language, script in LANGUAGES
 ]
 
-[
-    nisaba_grm_textproto_test(
-        name = "en_spellout_%s_utf8_test" % language,
-        textproto = "testdata/en_spellout_%s.textproto" % language,
-        token_type = "utf8",
-    )
-    for language in SPELLOUT_LANGUAGES
-]
+# Single English spellout grm target with a byte FAR and a utf8 FAR output.
+# Eg: //nisaba/scripts/natural_translit/deromanization:en_spellout
+nisaba_compile_multi_grm_py(
+    name = "en_spellout",
+    outs = {
+        "byte": "en_spellout.far",
+        "utf8": "en_spellout_utf8.far",
+    },
+    visibility = ["//visibility:public"],
+    deps = [
+        "//nisaba/scripts/natural_translit/brahmic:en_spellout",
+        "@org_opengrm_pynini//pynini",
+    ],
+)
+
+# English spellout test without an explicit token type or textproto; follows
+# the same shape as the "Byte tests" targets in this file.
+nisaba_grm_textproto_test(
+    name = "en_spellout_test",
+)
+
+# English spellout UTF-8 test; reads testdata/en_spellout.textproto.
+nisaba_grm_textproto_test(
+    name = "en_spellout_utf8_test",
+    textproto = "testdata/en_spellout.textproto",
+    token_type = "utf8",
+)
@@ -0,0 +1,50 @@
+# Copyright 2024 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""English letter spellout for various South Asian languages."""
+import pynini as pyn
+from pynini.export import multi_grm
+from nisaba.scripts.natural_translit.brahmic import en_spellout
+
+# Supported '<language>_<script>' tags. Each entry is split on '_' into the
+# values passed to en_spellout.Language and en_spellout.Script, and its
+# upper-cased form (e.g. 'HI_DEVA') is the name the speller is exported under.
+_LANG_SCRIPTS = (
+    'bn_beng',
+    'gu_gujr',
+    'hi_deva',
+    'kn_knda',
+    'ml_mlym',
+    'mr_deva',
+    'or_orya',
+    'pa_guru',
+    'sd_arab',
+    'si_sinh',
+    'ta_taml',
+    'te_telu',
+    'ur_arab',
+)
+
+
+def generator_main(exporter_map: multi_grm.ExporterMapping) -> None:
+  """Generates FAR for spellout grammars for all supported language scripts.
+
+  For every entry of _LANG_SCRIPTS, one speller FST is built per token type and
+  stored in the matching exporter under the upper-cased entry, e.g. 'HI_DEVA'.
+
+  Args:
+    exporter_map: Exporters indexed by token type; the 'byte' and 'utf8' keys
+      are the ones used here.
+  """
+  for lang_script in _LANG_SCRIPTS:
+    # The tag lookups are loop-invariant with respect to the token type, so
+    # resolve them once per entry. NOTE(review): assumes Language(...) and
+    # Script(...) are plain value lookups unaffected by the default token type.
+    language_tag, script_tag = lang_script.split('_')
+    language = en_spellout.Language(language_tag)
+    script = en_spellout.Script(script_tag)
+    rule_name = lang_script.upper()
+    for token_type in ('byte', 'utf8'):
+      # Build the FST while `token_type` is the pynini default token type.
+      with pyn.default_token_type(token_type):
+        exporter = exporter_map[token_type]
+        exporter[rule_name] = en_spellout.speller(language, script)
+
+
+if __name__ == '__main__':
+  # When run as a script, pass generator_main to the multi_grm runner.
+  multi_grm.run(generator_main)