Skip to content

Commit 1bd2c79

Browse files
agutkin authored and copybara-github committed
Reading normalization in C++ interface.
PiperOrigin-RevId: 657164548
1 parent a1cd9ec commit 1bd2c79

File tree

4 files changed

+103
-0
lines changed

4 files changed

+103
-0
lines changed

nisaba/scripts/brahmic/grammar.cc

+14
Original file line numberDiff line numberDiff line change
@@ -141,5 +141,19 @@ bool Normalizer::SupportsFst(absl::string_view fst_name) {
141141
return Normalizer{fst_name}.Load().ok();
142142
}
143143

144+
// Loads the two underlying grammars, visual normalization first and reading
// normalization second. A failure of the first load is returned immediately;
// otherwise the status of the second load is the overall result.
absl::Status ReadingNorm::Load() {
  RETURN_IF_ERROR(visual_norm_.Load());
  return reading_norm_.Load();
}
149+
150+
// Rewrites `input` in two stages: the visual normalization grammar is applied
// first, and its result is then passed through the reading normalization
// grammar, which is handed `output` as its destination. If the first stage
// fails its status is returned and the second stage is not run.
absl::Status ReadingNorm::Rewrite(absl::string_view input,
                                  std::string *output) const {
  std::string visually_normalized;
  RETURN_IF_ERROR(visual_norm_.Rewrite(input, &visually_normalized));
  return reading_norm_.Rewrite(visually_normalized, output);
}
157+
144158
} // namespace brahmic
145159
} // namespace nisaba

nisaba/scripts/brahmic/grammar.h

+22
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,28 @@ class Normalizer {
120120
Grammar wellformed_;
121121
};
122122

123+
// Provides reading normalization of Brahmic text after composing it with
124+
// visual normalization.
125+
class ReadingNorm {
126+
public:
127+
ReadingNorm(absl::string_view far_path, absl::string_view fst_name)
128+
: visual_norm_(far_path, "visual_norm", fst_name),
129+
reading_norm_(far_path, "reading_norm", fst_name){}
130+
131+
explicit ReadingNorm(absl::string_view fst_name) :
132+
visual_norm_("visual_norm", fst_name),
133+
reading_norm_("reading_norm", fst_name) {}
134+
135+
absl::Status Load();
136+
absl::Status Rewrite(absl::string_view input, std::string *output) const;
137+
138+
private:
139+
ReadingNorm() = delete;
140+
141+
Grammar visual_norm_;
142+
Grammar reading_norm_;
143+
};
144+
123145
} // namespace brahmic
124146
} // namespace nisaba
125147

nisaba/scripts/brahmic/grammar_test.cc

+58
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,64 @@ TEST(PreLoadedNormalizerTest, NormalizerWithPreLoadedManagers) {
161161
EXPECT_FALSE(normalizer.Rewrite("काु", &output).ok());
162162
}
163163

164+
// Expects that a `ReadingNorm` built for the given language or script tag
// loads successfully.
void CheckReadingNormLoadOk(absl::string_view language_or_script) {
  ReadingNorm normalizer{language_or_script};
  const absl::Status load_status = normalizer.Load();
  EXPECT_OK(load_status);
}
168+
169+
// Expects that loading a `ReadingNorm` built for the given language or script
// tag fails with status code `kInternal`.
void CheckReadingNormLoadError(absl::string_view language_or_script) {
  ReadingNorm normalizer{language_or_script};
  const absl::Status load_status = normalizer.Load();
  EXPECT_EQ(load_status.code(), absl::StatusCode::kInternal);
}
173+
174+
// Verifies which language and script tags have loadable reading normalization
// grammars and which ones are expected to fail.
TEST(ReadingNorm, CheckLoad) {
  // For language.
  for (const absl::string_view language : {"bn", "hi", "ml"}) {
    CheckReadingNormLoadOk(language);
  }
  // For script.
  for (const absl::string_view script : {"Beng", "Mlym", "Lepc"}) {
    CheckReadingNormLoadOk(script);
  }
  // No reading norm.
  for (const absl::string_view unsupported : {"Deva", "mr"}) {
    CheckReadingNormLoadError(unsupported);
  }
}
187+
188+
// TODO: Consider adding tests based on the data from the
189+
// nisaba/scripts/brahmic/testdata/reading_norm.tsv file.
190+
// Loads the reading normalizer for `language` and checks that every entry of
// `inputs` is rewritten to the entry of `expected_outputs` at the same index.
// `inputs` and `expected_outputs` are parallel vectors and must have the same
// length; the helper aborts the calling test otherwise.
void TestReadingNorm(absl::string_view language,
                     const std::vector<std::string>& inputs,
                     const std::vector<std::string>& expected_outputs) {
  // Guard the parallel indexing below: without this check a shorter
  // `expected_outputs` would be read out of bounds.
  ASSERT_EQ(inputs.size(), expected_outputs.size());
  ReadingNorm reading_norm(language);
  ASSERT_OK(reading_norm.Load());
  for (size_t n = 0; n < inputs.size(); ++n) {
    std::string output_word;
    EXPECT_OK(reading_norm.Rewrite(inputs[n], &output_word));
    // Include the input in the failure message so the failing pair can be
    // identified without a debugger.
    EXPECT_EQ(output_word, expected_outputs[n]) << "input: " << inputs[n];
  }
}
201+
202+
// Checks reading normalization of a sample word with the "bn" grammar.
TEST(ReadingNorm, bn) {
  const std::vector<std::string> words = {"সংগে"};
  const std::vector<std::string> normalized_words = {"সঙ্গে"};
  TestReadingNorm("bn", words, normalized_words);
}
207+
208+
// Checks reading normalization of a sample word with the "hi" grammar.
TEST(ReadingNorm, hi) {
  const std::vector<std::string> words = {"काङ्ग्रेस"};
  const std::vector<std::string> normalized_words = {"कांग्रेस"};
  TestReadingNorm("hi", words, normalized_words);
}
213+
214+
// Checks reading normalization of a sample word with the "ml" grammar.
TEST(ReadingNorm, ml) {
  // clang-format off
  const std::vector<std::string> words = {"​സ​ന്യാ​സി​വ​ൎയ്യ​ന്മാ​ർ​ക്ക"};
  const std::vector<std::string> normalized_words = {"സന്യാസിവര്യന്മാർക്ക"};
  // clang-format on
  TestReadingNorm("ml", words, normalized_words);
}
221+
164222
} // namespace
165223
} // namespace brahmic
166224
} // namespace nisaba

nisaba/scripts/brahmic/randgen_test.py

+9
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ def test_visual_norm(self, script: str, token_type: pynini.TokenType):
3737
fst = u.OpenFstFromBrahmicFar('visual_norm', script, token_type)
3838
self.AssertFstProbablyFunctional(fst, token_type)
3939

40+
@parameterized.parameters(
    itertools.product(u.READING_NORM_LANGS, ('byte', 'utf8'))
)
def test_reading_norm(
    self, script_lang: 'tuple[str, str]', token_type: pynini.TokenType
):
  """Checks that the reading normalization FST is probably functional.

  Args:
    script_lang: One entry of `u.READING_NORM_LANGS`, unpacked below as a
      (script, lang) pair.
    token_type: Token type used to open and check the FST.
  """
  script, lang = script_lang
  # Use the language as the tag when it is non-empty, else the script.
  tag = lang or script
  fst = u.OpenFstFromBrahmicFar('reading_norm', tag, token_type)
  self.AssertFstProbablyFunctional(fst, token_type)
48+
4049
@parameterized.parameters(itertools.product(u.SCRIPTS, ('byte', 'utf8')))
4150
def test_from_iso_to_native_single_best(
4251
self, script: str, token_type: pynini.TokenType

0 commit comments

Comments
 (0)