Skip to content

Commit

Permalink
Reading normalization in C++ interface.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 657164548
  • Loading branch information
agutkin authored and copybara-github committed Jul 29, 2024
1 parent a1cd9ec commit 1bd2c79
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 0 deletions.
14 changes: 14 additions & 0 deletions nisaba/scripts/brahmic/grammar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,19 @@ bool Normalizer::SupportsFst(absl::string_view fst_name) {
return Normalizer{fst_name}.Load().ok();
}

// Loads the visual normalization grammar and then the reading normalization
// grammar. Returns the first non-OK status encountered, or OK when both
// grammars load.
absl::Status ReadingNorm::Load() {
  RETURN_IF_ERROR(visual_norm_.Load());
  // Nothing is left to do after the second load, so its status is the result.
  return reading_norm_.Load();
}

// Rewrites `input` with the visual normalization grammar and feeds the result
// to the reading normalization grammar, which writes into `output`.
// Returns the first non-OK status encountered. If the visual normalization
// step fails, `output` is not passed to either grammar.
absl::Status ReadingNorm::Rewrite(absl::string_view input,
                                  std::string *output) const {
  // Intermediate buffer: result of the first pass, input to the second.
  std::string visually_normalized;
  RETURN_IF_ERROR(visual_norm_.Rewrite(input, &visually_normalized));
  return reading_norm_.Rewrite(visually_normalized, output);
}

} // namespace brahmic
} // namespace nisaba
22 changes: 22 additions & 0 deletions nisaba/scripts/brahmic/grammar.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,28 @@ class Normalizer {
Grammar wellformed_;
};

// Reading normalizer for Brahmic text.
//
// Holds two grammars, "visual_norm" and "reading_norm", both selected by the
// same FST name. `Rewrite` applies visual normalization first and then
// reading normalization to its result.
class ReadingNorm {
 public:
  // Builds both grammars from `far_path`, using `fst_name` to select the FST
  // in each of them.
  ReadingNorm(absl::string_view far_path, absl::string_view fst_name)
      : visual_norm_(far_path, "visual_norm", fst_name),
        reading_norm_(far_path, "reading_norm", fst_name) {}

  // Builds both grammars without an explicit FAR path, via the two-argument
  // `Grammar` constructor.
  explicit ReadingNorm(absl::string_view fst_name)
      : visual_norm_("visual_norm", fst_name),
        reading_norm_("reading_norm", fst_name) {}

  // Loads both grammars; returns the first error encountered, if any.
  absl::Status Load();

  // Normalizes `input` and stores the result in `output`.
  absl::Status Rewrite(absl::string_view input, std::string *output) const;

 private:
  // An FST name is always required.
  ReadingNorm() = delete;

  Grammar visual_norm_;
  Grammar reading_norm_;
};

} // namespace brahmic
} // namespace nisaba

Expand Down
58 changes: 58 additions & 0 deletions nisaba/scripts/brahmic/grammar_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,64 @@ TEST(PreLoadedNormalizerTest, NormalizerWithPreLoadedManagers) {
EXPECT_FALSE(normalizer.Rewrite("काु", &output).ok());
}

// Expects that a `ReadingNorm` constructed for `language_or_script` loads
// successfully.
void CheckReadingNormLoadOk(absl::string_view language_or_script) {
  // The expectation below fires inside this shared helper, so a failure is
  // reported at this location for every caller. Record the tag being checked
  // so the failure message identifies it.
  SCOPED_TRACE(language_or_script);
  ReadingNorm reading_norm(language_or_script);
  EXPECT_OK(reading_norm.Load());
}

// Expects that loading a `ReadingNorm` constructed for `language_or_script`
// fails with status code `kInternal`.
void CheckReadingNormLoadError(absl::string_view language_or_script) {
  // The expectation below fires inside this shared helper, so a failure is
  // reported at this location for every caller. Record the tag being checked
  // so the failure message identifies it.
  SCOPED_TRACE(language_or_script);
  ReadingNorm reading_norm(language_or_script);
  EXPECT_EQ(reading_norm.Load().code(), absl::StatusCode::kInternal);
}

TEST(ReadingNorm, CheckLoad) {
  // Tags expected to load: the languages first, then the scripts.
  for (const char *tag : {"bn", "hi", "ml", "Beng", "Mlym", "Lepc"}) {
    CheckReadingNormLoadOk(tag);
  }
  // Tags with no reading norm: loading is expected to fail.
  for (const char *tag : {"Deva", "mr"}) {
    CheckReadingNormLoadError(tag);
  }
}

// TODO: Consider adding tests based on the data in
// nisaba/scripts/brahmic/testdata/reading_norm.tsv.
// Loads the reading normalizer for `language` and checks that each element of
// `inputs` is rewritten to the element of `expected_outputs` at the same
// index. The two vectors must have the same length.
void TestReadingNorm(absl::string_view language,
                     const std::vector<std::string>& inputs,
                     const std::vector<std::string>& expected_outputs) {
  // The loop indexes both vectors in lockstep; without this check a shorter
  // `expected_outputs` would be read out of bounds.
  ASSERT_EQ(inputs.size(), expected_outputs.size());
  ReadingNorm reading_norm(language);
  ASSERT_OK(reading_norm.Load());
  for (size_t n = 0; n < inputs.size(); ++n) {
    // Identify the offending input when the rewrite itself fails.
    SCOPED_TRACE(inputs[n]);
    std::string output_word;
    EXPECT_OK(reading_norm.Rewrite(inputs[n], &output_word));
    EXPECT_EQ(output_word, expected_outputs[n]);
  }
}

TEST(ReadingNorm, bn) {
  // A single word paired with its expected reading-normalized form.
  const std::vector<std::string> words = {"সংগে"};
  const std::vector<std::string> normalized = {"সঙ্গে"};
  TestReadingNorm("bn", words, normalized);
}

TEST(ReadingNorm, hi) {
  // A single word paired with its expected reading-normalized form.
  const std::vector<std::string> words = {"काङ्ग्रेस"};
  const std::vector<std::string> normalized = {"कांग्रेस"};
  TestReadingNorm("hi", words, normalized);
}

TEST(ReadingNorm, ml) {
  // A single word paired with its expected reading-normalized form. The
  // literals are kept on their own lines, untouched by the formatter.
  // clang-format off
  const std::vector<std::string> words = {"​സ​ന്യാ​സി​വ​ൎയ്യ​ന്മാ​ർ​ക്ക"};
  const std::vector<std::string> normalized = {"സന്യാസിവര്യന്മാർക്ക"};
  // clang-format on
  TestReadingNorm("ml", words, normalized);
}

} // namespace
} // namespace brahmic
} // namespace nisaba
9 changes: 9 additions & 0 deletions nisaba/scripts/brahmic/randgen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ def test_visual_norm(self, script: str, token_type: pynini.TokenType):
fst = u.OpenFstFromBrahmicFar('visual_norm', script, token_type)
self.AssertFstProbablyFunctional(fst, token_type)

@parameterized.parameters(
    itertools.product(u.READING_NORM_LANGS, ('byte', 'utf8'))
)
def test_reading_norm(self, script_lang: str, token_type: pynini.TokenType):
  # Each `script_lang` is unpacked into a (script, lang) pair. The FST is
  # looked up by `lang` when it is non-empty, otherwise by `script`.
  # NOTE(review): the parameter is annotated `str` but is unpacked into two
  # values; confirm the element type of `u.READING_NORM_LANGS`.
  script, lang = script_lang
  fst = u.OpenFstFromBrahmicFar('reading_norm', lang or script, token_type)
  self.AssertFstProbablyFunctional(fst, token_type)

@parameterized.parameters(itertools.product(u.SCRIPTS, ('byte', 'utf8')))
def test_from_iso_to_native_single_best(
self, script: str, token_type: pynini.TokenType
Expand Down

0 comments on commit 1bd2c79

Please sign in to comment.