Skip to content

Commit

Permalink
range and cardinal tests are passing
Browse files Browse the repository at this point in the history
Signed-off-by: ekmb <[email protected]>
  • Loading branch information
ekmb committed Apr 4, 2022
1 parent 8df8271 commit d3d8437
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
final_graph = (
self.graph
| serial_graph
| self.single_digits_graph
| pynutil.add_weight(self.single_digits_graph, 0.0001)
| get_four_digit_year_graph()
| pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
| cardinal_with_leading_zeros
Expand Down
24 changes: 17 additions & 7 deletions nemo_text_processing/text_normalization/en/taggers/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.


from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst, convert_space

try:
import pynini
Expand All @@ -31,33 +31,43 @@ class RangeFst(GraphFst):
for False multiple transductions are generated (used for audio-based normalization)
"""

def __init__(self, time: GraphFst, date: GraphFst, cardinal: GraphFst = None, deterministic: bool = True):
def __init__(
self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False
):
super().__init__(name="range", kind="classify", deterministic=deterministic)

delete_space = pynini.closure(pynutil.delete(" "), 0, 1)
self.graph = time + delete_space + pynini.cross("-", " to ") + delete_space + time
date_year = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
self.graph |= date_year + delete_space + pynini.cross("-", " to ") + delete_space + date_year

year_to_year_graph = date_year + delete_space + pynini.cross("-", " to ") + delete_space + date_year
self.graph |= year_to_year_graph

cardinal = cardinal.graph
# this will use year for 4-digit cardinal
up_to_three_morfive_digits = (NEMO_DIGIT ** (1, 3)) | (NEMO_DIGIT ** (5, ...))
up_to_three_morfive_digits = pynini.compose(up_to_three_morfive_digits, cardinal)
range_graph = (
cardinal_to_cardinal_graph = (
up_to_three_morfive_digits
+ delete_space
+ pynini.cross("-", " to ")
+ delete_space
+ up_to_three_morfive_digits
)
range_graph |= cardinal + delete_space + pynini.cross(":", " to ") + delete_space + cardinal

if not deterministic and not lm:
cardinal_to_cardinal_graph |= cardinal + delete_space + pynini.cross("-", " to ") + delete_space + cardinal

range_graph = cardinal_to_cardinal_graph | (
cardinal + delete_space + pynini.cross(":", " to ") + delete_space + cardinal
)
for x in [" x ", "x"]:
range_graph |= cardinal + pynini.closure(pynini.cross(x, pynini.union(" by ", " times ")) + cardinal, 1)
for x in ["*", " * "]:
range_graph |= cardinal + pynini.closure(pynini.cross(x, " times ") + cardinal, 1)

if not deterministic:
range_graph = cardinal + delete_space + pynini.cross("-", " minus ") + delete_space + cardinal
if not deterministic or lm:
range_graph |= cardinal + delete_space + pynini.cross("-", " minus ") + delete_space + cardinal

# supports "No. 12" -> "Number 12"
range_graph |= (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ one us dollar one cent
~1.24
one point two four
one point twenty four
~~1/24
january twenty four
the twenty fourth of january
january twenty fourth
Expand Down Expand Up @@ -46,8 +47,8 @@ It seemed to her that the jacket Oswald wore was darker than Commission Exhibit
nineteen seventy-twenty ten
one thousand nine hundred and seventy-two thousand ten
one thousand nine hundred and seventy-twenty ten
from nineteen seventy to twenty ten
from one thousand nine hundred and seventy to two thousand ten
nineteen seventy to twenty ten
one thousand nine hundred seventy to two thousand ten
~W26s
W twenty six s
W two six s
Expand All @@ -58,7 +59,6 @@ four oh one-ks
four hundred one-ks
~The box was 25 x 7 m.
The box was twenty five by seven meters.
The box was twenty five by seven m.
~4567
four thousand five hundred sixty seven
four five six seven
Expand Down Expand Up @@ -104,7 +104,7 @@ Test
august fifth two thousand six
the fifth of august two thousand six
~2-5
from two to five
two to five
two-five
~627
six hundred twenty seven
Expand Down Expand Up @@ -139,13 +139,13 @@ Tuesday (the twenty second of february) at five p m.
Tuesday (the twenty second of february) at five pm.
Tuesday (two divided by twenty two) at five pm.
Tuesday (february twenty second) at five pm.
Tuesday (february twenty second) at five p m.
Tuesday (february twenty two) at five p m.
Tuesday (two/two two) at five p m.
Tuesday (two divided by twenty two) at five p m.
Tuesday (two twenty seconds) at five pm.
Tuesday (two/twenty two) at five p m.
Tuesday (two/twenty two) at five pm.
Tuesday (february twenty second) at five p m.
Tuesday (february twenty two) at five pm.
Tuesday (two/two two) at five pm.
Tuesday (two twenty seconds) at five p m.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@ Hi it's 5pm,4A.M.?-34. Hi,no,yes,34! 12,again,4 and NO?17 and $.01,here & there-
there--0.4kg~there - minus zero point four kilograms
there -0.4kg~there minus zero point four kilograms
there- -0.4kg~there - minus zero point four kilograms
ÀÁÂÃ check §- and ƛ, also ɧ~ÀÁÂÃ check § - and ƛ , also ɧ
ÀÁÂÃ check §- and ƛ, also ɧ~ÀÁÂÃ check section - and ƛ , also ɧ
$ and 5% or %~dollar and five percent or percent sign

0 comments on commit d3d8437

Please sign in to comment.