From 6505fffa0399e1729cbfa4f1499bc9dec5e123f2 Mon Sep 17 00:00:00 2001
From: Isin Demirsahin
Date: Wed, 29 Nov 2023 04:29:48 -0800
Subject: [PATCH] No public description

PiperOrigin-RevId: 586295772
---
Notes (review):
    This section lies between the "---" separator and the first
    "diff --git" line, so it belongs neither to the commit message nor to
    any hunk.

    Formatting: the line breaks, indentation and blank context lines below
    were reconstructed from a whitespace-collapsed copy of this patch.
    Blank lines were placed so that every hunk matches its "@@" line
    counts; indentation of context lines is assumed, not recovered.
    TODO confirm against the target files (or apply with
    --ignore-whitespace) before relying on it.

    What the patch changes:
    * brahmic/iso_inventory.py: adds SCHWA_BEARING (SIMPLE_CONSONANT +
      ASPIRATED_CONSONANT), registers it as the store 'SCH_CONS' through
      c.store_tr_union, appends that store to STORES, and adds
      TRANSLIT_INVENTORY = c.tr_inventory(CHAR, STORES).
    * brahmic/nativizer/BUILD.bazel (new): py_library "ltn2iso" and a
      nisaba_compile_multi_grm_py target "nativize" with outputs
      nativize.far ("byte") and nativize_utf8.far ("utf8").
      NOTE(review): "nativize" depends on phonology:txn2ltn, but
      nativize.py as added here imports only pynini, multi_grm and
      ltn2iso -- confirm the dep is needed.
    * brahmic/nativizer/ltn2iso.py (new): LTN_GR / ISO_TR glyph
      readers/printers, the rw.rewrite_ls tables LONG_VOWEL, HI_VOWEL,
      SHORT_VOWEL, BASE_TWO, BASE_ONE, ASPIRATION, NUKTA and VIS, and
      SCHWA_INSERTION = rw.insert(iso.A, iso.SCH_CONS,
      union(iso.SCH_CONS, rw.al.EOW)).
    * brahmic/nativizer/nativize.py (new): composes
      LTN_GR @ ASPIRATION @ BASE_TWO @ NUKTA @ BASE_ONE @ LONG_VOWEL @
      HI_VOWEL @ SHORT_VOWEL @ SCHWA_INSERTION @ ISO_TR, optimizes it,
      and exports it under the key 'HI' for the 'byte' and 'utf8' token
      types. VIS is defined in ltn2iso.py but is not part of this
      composition.
      NOTE(review): NUKTA precedes BASE_ONE and both have rows for ltn.F
      and ltn.Z; HI_VOWEL precedes SHORT_VOWEL and both have rows for
      ltn.E and ltn.O. Presumably the earlier table consumes those
      letters first -- confirm against rw.rewrite_ls.
    * latin/ltn_inventory.py: adds
      GRAPHEME_INVENTORY = c.gr_inventory(CHARS, [EN_LETTERS]).

 .../natural_translit/brahmic/iso_inventory.py |   5 +-
 .../brahmic/nativizer/BUILD.bazel             |  48 ++++++++
 .../brahmic/nativizer/ltn2iso.py              | 104 ++++++++++++++++++
 .../brahmic/nativizer/nativize.py             |  46 ++++++++
 .../natural_translit/latin/ltn_inventory.py   |   1 +
 5 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 nisaba/scripts/natural_translit/brahmic/nativizer/BUILD.bazel
 create mode 100644 nisaba/scripts/natural_translit/brahmic/nativizer/ltn2iso.py
 create mode 100644 nisaba/scripts/natural_translit/brahmic/nativizer/nativize.py

diff --git a/nisaba/scripts/natural_translit/brahmic/iso_inventory.py b/nisaba/scripts/natural_translit/brahmic/iso_inventory.py
index 047fc099..f5c7350f 100644
--- a/nisaba/scripts/natural_translit/brahmic/iso_inventory.py
+++ b/nisaba/scripts/natural_translit/brahmic/iso_inventory.py
@@ -207,6 +207,8 @@ def _make_aspirated(char: c.Char) -> c.Char:
     [[sp.R, sp.EYE], 'r_eye', ph.RT]
 ])
 
+SCHWA_BEARING = SIMPLE_CONSONANT + ASPIRATED_CONSONANT
+SCHWA_BEARING_TR = c.store_tr_union('SCH_CONS', SCHWA_BEARING)
 COMPOSITE_CONSONANT = ASPIRATED_CONSONANT + DEAD_CONSONANT
 
 CND = [c.make_composite_char([sp.M, sp.CND_DIA], 'cnd', ph.NSL)]
@@ -218,6 +220,7 @@ def _make_aspirated(char: c.Char) -> c.Char:
 TWO_POINT = TWO_POINT_SIGN + INDEPENDENT_VOWEL + COMPOSITE_CONSONANT + CND + OM
 
 CHAR = (SINGLE_POINT + TWO_POINT + LONG_VOCALIC)
-STORES = [VOWEL_S, VOWEL_I, CODA, VOCALIC]
+STORES = [VOWEL_S, VOWEL_I, CODA, VOCALIC, SCHWA_BEARING_TR]
 
 GRAPHEME_INVENTORY = c.gr_inventory(CHAR, STORES)
+TRANSLIT_INVENTORY = c.tr_inventory(CHAR, STORES)
diff --git a/nisaba/scripts/natural_translit/brahmic/nativizer/BUILD.bazel b/nisaba/scripts/natural_translit/brahmic/nativizer/BUILD.bazel
new file mode 100644
index 00000000..47b0b60b
--- /dev/null
+++ b/nisaba/scripts/natural_translit/brahmic/nativizer/BUILD.bazel
@@ -0,0 +1,48 @@
+# Copyright 2023 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@rules_python//python:py_library.bzl", "py_library")
+load(
+    "//nisaba/scripts/utils:grammars.bzl",
+    "nisaba_compile_multi_grm_py",
+)
+
+package(default_applicable_licenses = [
+])
+
+py_library(
+    name = "ltn2iso",
+    srcs = ["ltn2iso.py"],
+    deps = [
+        "//nisaba/scripts/natural_translit/brahmic:iso_inventory",
+        "//nisaba/scripts/natural_translit/latin:ltn_inventory",
+        "//nisaba/scripts/natural_translit/script:char",
+        "//nisaba/scripts/natural_translit/utils:rewrite_functions",
+        "@org_opengrm_pynini//pynini",
+    ],
+)
+
+nisaba_compile_multi_grm_py(
+    name = "nativize",
+    outs = {
+        "byte": "nativize.far",
+        "utf8": "nativize_utf8.far",
+    },
+    visibility = ["//visibility:public"],
+    deps = [
+        ":ltn2iso",
+        "//nisaba/scripts/natural_translit/phonology:txn2ltn",
+        "@org_opengrm_pynini//pynini",
+    ],
+)
diff --git a/nisaba/scripts/natural_translit/brahmic/nativizer/ltn2iso.py b/nisaba/scripts/natural_translit/brahmic/nativizer/ltn2iso.py
new file mode 100644
index 00000000..2e658576
--- /dev/null
+++ b/nisaba/scripts/natural_translit/brahmic/nativizer/ltn2iso.py
@@ -0,0 +1,104 @@
+# Copyright 2023 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Latin to ISO fallback grammar."""
+
+import pynini as pyn
+from nisaba.scripts.natural_translit.brahmic import iso_inventory
+from nisaba.scripts.natural_translit.latin import ltn_inventory
+from nisaba.scripts.natural_translit.script import char as c
+from nisaba.scripts.natural_translit.utils import rewrite_functions as rw
+
+ltn = ltn_inventory.GRAPHEME_INVENTORY
+iso = iso_inventory.TRANSLIT_INVENTORY
+
+LTN_GR = c.read_glyph(ltn_inventory.ASCII_LC)
+ISO_TR = c.print_glyph(iso_inventory.CHAR)
+
+LONG_VOWEL = rw.rewrite_ls([
+    [ltn.A + ltn.A, iso.AA],
+    [ltn.E + ltn.E, iso.EE],
+    [ltn.I + ltn.I, iso.II],
+    [ltn.O + ltn.O, iso.OO],
+    [ltn.U + ltn.U, iso.UU],
+    [ltn.A + ltn.I, iso.AI],
+    [ltn.A + ltn.U, iso.AU],
+])
+
+HI_VOWEL = rw.rewrite_ls([
+    [ltn.E, iso.EE],
+    [ltn.O, iso.OO],
+])
+
+SHORT_VOWEL = rw.rewrite_ls([
+    [ltn.A, iso.A],
+    [ltn.E, iso.E],
+    [ltn.I, iso.I],
+    [ltn.O, iso.O],
+    [ltn.U, iso.U],
+])
+
+
+BASE_TWO = rw.rewrite_ls([
+    [ltn.C + ltn.H, iso.C],
+    [ltn.S + ltn.H, iso.SH],
+])
+
+BASE_ONE = rw.rewrite_ls([
+    [ltn.B, iso.B],
+    [ltn.C, iso.K],
+    [ltn.D, iso.D],
+    [ltn.F, iso.P + iso.H],
+    [ltn.G, iso.G],
+    [ltn.H, iso.H],
+    [ltn.J, iso.J],
+    [ltn.K, iso.K],
+    [ltn.L, iso.L],
+    [ltn.M, iso.M],
+    [ltn.N, iso.N],
+    [ltn.Q, iso.K],
+    [ltn.P, iso.P],
+    [ltn.R, iso.R],
+    [ltn.S, iso.S],
+    [ltn.T, iso.T],
+    [ltn.V, iso.V],
+    [ltn.W, iso.V],
+    [ltn.X, iso.K + iso.S],
+    [ltn.Y, iso.Y],
+    [ltn.Z, iso.J],
+])
+
+ASPIRATION = rw.rewrite_ls([
+    [ltn.B + ltn.H, iso.BH],
+    [ltn.C + ltn.H + ltn.H, iso.CH],
+    [ltn.D + ltn.H, iso.DH],
+    [ltn.G + ltn.H, iso.GH],
+    [ltn.J + ltn.H, iso.JH],
+    [ltn.K + ltn.H, iso.KH],
+    [ltn.P + ltn.H, iso.PH],
+    [ltn.T + ltn.H, iso.TH],
+])
+
+NUKTA = rw.rewrite_ls([
+    [ltn.F, iso.F],
+    [ltn.Z, iso.Z],
+])
+
+VIS = rw.rewrite_ls([
+    [ltn.F, iso.VIS + iso.P],
+])
+
+SCHWA_INSERTION = rw.insert(
+    iso.A, iso.SCH_CONS, pyn.union(iso.SCH_CONS, rw.al.EOW)
+)
diff --git a/nisaba/scripts/natural_translit/brahmic/nativizer/nativize.py b/nisaba/scripts/natural_translit/brahmic/nativizer/nativize.py
new file mode 100644
index 00000000..9a893d73
--- /dev/null
+++ b/nisaba/scripts/natural_translit/brahmic/nativizer/nativize.py
@@ -0,0 +1,46 @@
+# Copyright 2023 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""End-to-end natural transliteration for Hindi."""
+
+import pynini as pyn
+from pynini.export import multi_grm
+from nisaba.scripts.natural_translit.brahmic.nativizer import ltn2iso
+
+HI = (
+    ltn2iso.LTN_GR
+    @ ltn2iso.ASPIRATION
+    @ ltn2iso.BASE_TWO
+    @ ltn2iso.NUKTA
+    @ ltn2iso.BASE_ONE
+    @ ltn2iso.LONG_VOWEL
+    @ ltn2iso.HI_VOWEL
+    @ ltn2iso.SHORT_VOWEL
+    @ ltn2iso.SCHWA_INSERTION
+    @ ltn2iso.ISO_TR
+).optimize()
+
+
+def generator_main(exporter_map: multi_grm.ExporterMapping):
+  """Generates FAR for natural transliteration."""
+  for token_type in ('byte', 'utf8'):
+    with pyn.default_token_type(token_type):
+
+      exporter = exporter_map[token_type]
+      exporter['HI'] = HI
+
+
+if __name__ == '__main__':
+  multi_grm.run(generator_main)
diff --git a/nisaba/scripts/natural_translit/latin/ltn_inventory.py b/nisaba/scripts/natural_translit/latin/ltn_inventory.py
index a2705072..58c22cc9 100644
--- a/nisaba/scripts/natural_translit/latin/ltn_inventory.py
+++ b/nisaba/scripts/natural_translit/latin/ltn_inventory.py
@@ -57,6 +57,7 @@ def double_substring_tr(tr: pyn.Fst) -> pyn.Fst:
 EN_LETTERS = c.store_tr_star('EN_LETTERS', ASCII_UC)
 
 CHARS = ASCII_LC + ASCII_UC + SUBSTRING + DEL
+GRAPHEME_INVENTORY = c.gr_inventory(CHARS, [EN_LETTERS])
 TRANSLIT_INVENTORY = c.tr_inventory(CHARS, [EN_LETTERS])
 
 