Skip to content

Commit 8de964e

Browse files
cibu authored and copybara-github committed
Nisaba en_spellout now generates a single FAR file containing the FSTs for all supported languages, instead of a separate FAR file for each language.
For Tamil, removed the usage of `ZWNJ`, as Tamil script does not usually form conjunct ligatures.
PiperOrigin-RevId: 649651609
1 parent f7e938d commit 8de964e

30 files changed

+281
-825
lines changed

nisaba/scripts/natural_translit/brahmic/en_spellout.py

+11-11
Original file line number | Diff line number | Diff line change
@@ -196,7 +196,7 @@ def _spellout_inventory() -> i.Inventory:
196196
'ਐੱਫ਼', # pa_guru
197197
'ايف', # sd_arab
198198
'එෆ්', # si_sinh
199-
'எஃப்', # ta_taml
199+
'எஃப்', # ta_taml
200200
'ఎఫ్‌', # te_telu
201201
'ایف', # ur_arab
202202
),
@@ -228,7 +228,7 @@ def _spellout_inventory() -> i.Inventory:
228228
'ਐੱਚ', # pa_guru
229229
'ايڇ', # sd_arab
230230
'එච්', # si_sinh
231-
'ஹெச்', # ta_taml
231+
'ஹெச்', # ta_taml
232232
'హెచ్‌', # te_telu
233233
'ایچ', # ur_arab
234234
),
@@ -292,7 +292,7 @@ def _spellout_inventory() -> i.Inventory:
292292
'ਐੱਲ', # pa_guru
293293
'ايل', # sd_arab
294294
'එල්', # si_sinh
295-
'எல்', # ta_taml
295+
'எல்', # ta_taml
296296
'ఎల్‌', # te_telu
297297
'ایل', # ur_arab
298298
),
@@ -308,7 +308,7 @@ def _spellout_inventory() -> i.Inventory:
308308
'ਐੱਮ', # pa_guru
309309
'ايم', # sd_arab
310310
'එම්', # si_sinh
311-
'எம்', # ta_taml
311+
'எம்', # ta_taml
312312
'ఎం', # te_telu
313313
'ایم', # ur_arab
314314
),
@@ -324,7 +324,7 @@ def _spellout_inventory() -> i.Inventory:
324324
'ਐੱਨ', # pa_guru
325325
'اين', # sd_arab
326326
'එන්', # si_sinh
327-
'என்', # ta_taml
327+
'என்', # ta_taml
328328
'ఎన్‌', # te_telu
329329
'این', # ur_arab
330330
),
@@ -388,7 +388,7 @@ def _spellout_inventory() -> i.Inventory:
388388
'ਆਰ', # pa_guru
389389
'آر', # sd_arab
390390
'ආර්', # si_sinh
391-
'ஆர்', # ta_taml
391+
'ஆர்', # ta_taml
392392
'ఆర్‌', # te_telu
393393
'آر', # ur_arab
394394
),
@@ -404,7 +404,7 @@ def _spellout_inventory() -> i.Inventory:
404404
'ਐੱਸ', # pa_guru
405405
'ايس', # sd_arab
406406
'එස්', # si_sinh
407-
'எஸ்', # ta_taml
407+
'எஸ்', # ta_taml
408408
'ఎస్‌', # te_telu
409409
'ایس', # ur_arab
410410
),
@@ -484,7 +484,7 @@ def _spellout_inventory() -> i.Inventory:
484484
'ਐੱਕਸ', # pa_guru
485485
'ايڪس', # sd_arab
486486
'එක්ස්', # si_sinh
487-
'எக்ஸ்', # ta_taml
487+
'எக்ஸ்', # ta_taml
488488
'ఎక్స్‌', # te_telu
489489
'ایکس', # ur_arab
490490
),
@@ -500,7 +500,7 @@ def _spellout_inventory() -> i.Inventory:
500500
'ਵਾਈ', # pa_guru
501501
'وائي', # sd_arab
502502
'වයි', # si_sinh
503-
'ஒய்', # ta_taml
503+
'ஒய்', # ta_taml
504504
'వై', # te_telu
505505
'وائے', # ur_arab
506506
),
@@ -516,7 +516,7 @@ def _spellout_inventory() -> i.Inventory:
516516
'ਜ਼ੈਡ', # pa_guru
517517
'زيڊ', # sd_arab
518518
'සඩ්', # si_sinh
519-
'இஜட்', # ta_taml
519+
'இஜட்', # ta_taml
520520
'జడ్‌', # te_telu
521521
'زیڈ', # ur_arab
522522
),
@@ -556,7 +556,7 @@ def _separated(
556556

557557
def speller(language: Language, script: Script) -> pyn.Fst:
558558
"""Builds a speller FST for the given language and script."""
559-
has_zwnj = [Script.KNDA, Script.MLYM, Script.TAML, Script.TELU]
559+
has_zwnj = [Script.KNDA, Script.MLYM, Script.TELU]
560560
spellouts = i.Inventory.from_list([
561561
ty.Thing(
562562
letter.letter, value_from=_get_spellout(letter, language, script)

nisaba/scripts/natural_translit/deromanization/BUILD.bazel

+22-47
Original file line number | Diff line number | Diff line change
@@ -35,22 +35,6 @@ LANGUAGES = [
3535
("ta", "taml"),
3636
]
3737

38-
SPELLOUT_LANGUAGES = [
39-
"bn_beng",
40-
"gu_gujr",
41-
"hi_deva",
42-
"kn_knda",
43-
"ml_mlym",
44-
"mr_deva",
45-
"or_orya",
46-
"pa_guru",
47-
"sd_arab",
48-
"si_sinh",
49-
"ta_taml",
50-
"te_telu",
51-
"ur_arab",
52-
]
53-
5438
# Compile ISO grms. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso
5539

5640
[
@@ -87,22 +71,6 @@ SPELLOUT_LANGUAGES = [
8771
for language, script in LANGUAGES
8872
]
8973

90-
[
91-
nisaba_compile_multi_grm_py(
92-
name = "en_spellout_%s" % language,
93-
outs = {
94-
"byte": "en_spellout_%s.far" % language,
95-
"utf8": "en_spellout_%s_utf8.far" % language,
96-
},
97-
visibility = ["//visibility:public"],
98-
deps = [
99-
"//nisaba/scripts/natural_translit/brahmic:en_spellout",
100-
"@org_opengrm_pynini//pynini",
101-
],
102-
)
103-
for language in SPELLOUT_LANGUAGES
104-
]
105-
10674
# Byte tests. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso_test
10775

10876
[
@@ -119,13 +87,6 @@ SPELLOUT_LANGUAGES = [
11987
for language, script in LANGUAGES
12088
]
12189

122-
[
123-
nisaba_grm_textproto_test(
124-
name = "en_spellout_%s_test" % language,
125-
)
126-
for language in SPELLOUT_LANGUAGES
127-
]
128-
12990
# UTF-8 tests. Eg: //nisaba/scripts/natural_translit/deromanization:hi_iso_utf8_test
13091
[
13192
nisaba_grm_textproto_test(
@@ -145,11 +106,25 @@ SPELLOUT_LANGUAGES = [
145106
for language, script in LANGUAGES
146107
]
147108

148-
[
149-
nisaba_grm_textproto_test(
150-
name = "en_spellout_%s_utf8_test" % language,
151-
textproto = "testdata/en_spellout_%s.textproto" % language,
152-
token_type = "utf8",
153-
)
154-
for language in SPELLOUT_LANGUAGES
155-
]
109+
nisaba_compile_multi_grm_py(
110+
name = "en_spellout",
111+
outs = {
112+
"byte": "en_spellout.far",
113+
"utf8": "en_spellout_utf8.far",
114+
},
115+
visibility = ["//visibility:public"],
116+
deps = [
117+
"//nisaba/scripts/natural_translit/brahmic:en_spellout",
118+
"@org_opengrm_pynini//pynini",
119+
],
120+
)
121+
122+
nisaba_grm_textproto_test(
123+
name = "en_spellout_test",
124+
)
125+
126+
nisaba_grm_textproto_test(
127+
name = "en_spellout_utf8_test",
128+
textproto = "testdata/en_spellout.textproto",
129+
token_type = "utf8",
130+
)
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2024 Nisaba Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""English letter spellout for various South Asian languages."""
16+
import pynini as pyn
17+
from pynini.export import multi_grm
18+
from nisaba.scripts.natural_translit.brahmic import en_spellout
19+
20+
_LANG_SCRIPTS = (
21+
'bn_beng',
22+
'gu_gujr',
23+
'hi_deva',
24+
'kn_knda',
25+
'ml_mlym',
26+
'mr_deva',
27+
'or_orya',
28+
'pa_guru',
29+
'sd_arab',
30+
'si_sinh',
31+
'ta_taml',
32+
'te_telu',
33+
'ur_arab',
34+
)
35+
36+
37+
def generator_main(exporter_map: multi_grm.ExporterMapping):
38+
"""Generates FAR for spellout grammars for all supported language scripts."""
39+
for lang_script in _LANG_SCRIPTS:
40+
for token_type in ('byte', 'utf8'):
41+
with pyn.default_token_type(token_type):
42+
language_tag, script_tag = lang_script.split('_')
43+
language = en_spellout.Language(language_tag)
44+
script = en_spellout.Script(script_tag)
45+
exporter = exporter_map[token_type]
46+
exporter[lang_script.upper()] = en_spellout.speller(language, script)
47+
48+
49+
if __name__ == '__main__':
50+
multi_grm.run(generator_main)

nisaba/scripts/natural_translit/deromanization/en_spellout_bn_beng.py

-30
This file was deleted.

nisaba/scripts/natural_translit/deromanization/en_spellout_gu_gujr.py

-30
This file was deleted.

nisaba/scripts/natural_translit/deromanization/en_spellout_hi_deva.py

-30
This file was deleted.

nisaba/scripts/natural_translit/deromanization/en_spellout_kn_knda.py

-30
This file was deleted.

nisaba/scripts/natural_translit/deromanization/en_spellout_ml_mlym.py

-30
This file was deleted.

0 commit comments

Comments
 (0)