Skip to content

Commit 7935ec5

Browse files
committed
[common] Update French number formula.
1 parent 4cb0587 commit 7935ec5

File tree

9 files changed

+180
-118
lines changed

9 files changed

+180
-118
lines changed

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "lib/nltk"]
55
path = lib/nltk
66
url = https://github.com/nltk/nltk.git
7+
[submodule "lib/num2words"]
8+
path = lib/num2words
9+
url = https://github.com/savoirfairelinux/num2words.git

INSTALL.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ This install manual is for Debian based systems.
88

99
`git submodule update --init lib/unicodecsv`
1010

11-
2. Install both libraries using the *setup.py* scripts.
11+
`git submodule update --init lib/num2words`
12+
13+
2. Install the three libraries using the *setup.py* scripts.
1214

1315
`setup.py install --prefix=/path/to/local/install`
1416

lib/num2words

Submodule num2words added at f5fc179

scripts/common/AsrtConstants.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,10 @@
121121
(ur"\u215D",u"5/8",u"Number forms: cinq huitieme",u"0"),
122122
(ur"\u215E",u"7/8",u"Number forms: sept huitieme",u"0")]
123123

124-
PUNCTUATIONEXCLUDE = ['!', '"', '#', "'", '(', ')', '*', '+', ',', '-',
124+
#Do not exclude single quote
125+
PUNCTUATIONEXCLUDE = ['!', '"', '#', '(', ')', '*', '+', ',', '-',
125126
'.', '/', ':', ';', '<', '=', '>', '?', '[', '\\',
126-
']', '^', '_', '`', '{', '|', '}', '~', "'"]
127+
']', '^', '_', '`', '{', '|', '}', '~']
127128
PUNCTUATIONMAP = {
128129
"%": (r"%",r"pourcent", u"Prozent", u"percent", u"per cento"),
129130
"&": (r"&",r"et", u"und", u"and", u"e"),

scripts/common/FormulaLMPreparation.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,9 @@ def _normalizePunctuation(self):
212212
unicodeList = []
213213
for i, c in enumerate(self.strText):
214214
strC = c.encode('utf-8')
215+
#For date format, e.g. 21-Jul
215216
if strC in PUNCTUATIONEXCLUDE:
216-
continue
217+
unicodeList.append(u" ")
217218
elif self.languageId != 0 and strC in PUNCTUATIONMAP:
218219
unicodeList.append(u" " + PUNCTUATIONMAP[strC][self.languageId] + u" ")
219220
else:

scripts/common/french/FormulaNumber.py

+90-19
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,22 @@
2828

2929
import logging, re
3030
from FormulaLMPreparation import LMPreparationFormula
31+
from num2words import num2words
32+
from roman import fromRoman
3133

3234
class NumberFormula():
3335
"""Various number formats expansion.
3436
"""
3537
logger = logging.getLogger("Asrt.NumberFormula")
3638

39+
THOUSANDSEPARATOR = u"'"
40+
41+
HASNUMBERREGEX = re.compile(u"([0-9]|I|V|X|L|C|D|M)+", flags=re.UNICODE)
3742
CARDINALNUMBERREGEX = re.compile(u"[0-9]+$", flags=re.UNICODE)
38-
ORDINALNUMBERREGEX = re.compile(u"(1er|1re|[0-9]+e||[0-9]+ème)$", flags=re.UNICODE)
39-
DECIMALNUMBERREGEX = re.compile(u"[0-9]+[,.][0-9]+$", flags=re.UNICODE)
40-
ROMANNUMBERREGEX = re.compile(u"(I|V|X|L|C|D|M)+(er|re|e|eme|ème)$", flags=re.UNICODE)
43+
ORDINALNUMBERREGEX = re.compile(u"(1er|1re|1ère|[0-9]+e|[0-9]+ème|Ier|Ière|[IVXLCDM]+ème)$", flags=re.UNICODE)
44+
ORDINALREPLACEREGEX = re.compile(u"[erèm]", flags=re.UNICODE)
45+
DECIMALNUMBERREGEX = re.compile(u"[0-9,.]+[0-9,.]*$", flags=re.UNICODE)
46+
ROMANNUMBERREGEX = re.compile(u"[IVXLCDM]+(er|re|ère|e|ème)?$", flags=re.UNICODE)
4147

4248
##################
4349
#Public interface
@@ -54,26 +60,61 @@ def apply(self, strText):
5460
param strText: an utf-8 encoded string
5561
return an utf-8 encoded string
5662
"""
57-
wordsList = re.split(LMPreparationFormula.SPACEREGEX, strText, flags=re.UNICODE)
63+
wordsList = re.split(LMPreparationFormula.SPACEPATTERN, strText, flags=re.UNICODE)
5864

5965
newWordsList = []
6066
for w in wordsList:
61-
if self._isCardinalNumber(w):
62-
pass
63-
elif self._isOrdinalNumber(w):
64-
pass
65-
elif self._isDecimalNumber(w):
66-
pass
67-
elif self._isRomanNumber(w):
68-
pass
69-
else:
67+
if not self._hasNumber(w):
68+
newWordsList.append(w)
69+
continue
70+
#Numbers may contain alphanumeric
71+
#characters
72+
wNorm = self._normalizeNumber(w)
73+
try:
74+
#Now check number type
75+
if self._isCardinalNumber(wNorm):
76+
wNorm = self._cardinal2word(wNorm)
77+
elif self._isOrdinalNumber(wNorm):
78+
wNorm = self._ordinal2word(wNorm)
79+
elif self._isDecimalNumber(wNorm):
80+
wNorm = self._decimal2word(wNorm)
81+
elif self._isRomanNumber(wNorm):
82+
wNorm = self._roman2word(wNorm)
83+
else:
84+
self.logger.info("Unknown number format: %s" % w.encode('utf-8'))
85+
wNorm = w
86+
newWordsList.append(wNorm)
87+
88+
except Exception, e:
89+
self.logger.warning("Error formatting number (%s): %s" % \
90+
(w.encode('utf-8'), str(e)))
7091
newWordsList.append(w)
7192

7293
return u" ".join(newWordsList)
7394

7495
##################
7596
#Implementation
7697
#
98+
@staticmethod
99+
def _hasNumber(strWord):
100+
"""Check if 'strWord' contains numbers.
101+
102+
param strWord: a utf-8 encoded word
103+
return True or False
104+
"""
105+
#Use search instead of match
106+
return NumberFormula.HASNUMBERREGEX.search(strWord) != None
107+
108+
@staticmethod
109+
def _normalizeNumber(strWord):
110+
"""Remove thousand separator.
111+
112+
param strWord: a utf-8 encoded word
113+
return an utf-8 encoded string
114+
"""
115+
strWord = strWord.replace(NumberFormula.THOUSANDSEPARATOR, u"")
116+
return strWord
117+
77118
@staticmethod
78119
def _cardinal2word(strNumber):
79120
"""Convert a cardinal number to a written
@@ -82,18 +123,36 @@ def _cardinal2word(strNumber):
82123
param strNumber: an utf-8 cardinal number
83124
return a 'written' cardinal number
84125
"""
85-
pass
126+
strNumber = num2words(int(strNumber), lang='fr')
127+
return strNumber.replace(u"-", u" ")
86128

87129
@staticmethod
88130
def _ordinal2word(strNumber):
89131
"""Convert an ordinal number to a written
90132
word.
91133
134+
i.e. 1er --> premier
135+
92136
param strNumber: an utf-8 ordinal number
93137
return a 'written' ordinal number
94138
"""
95-
pass
96-
139+
if strNumber.encode('utf-8') == u"1ère".encode('utf-8'):
140+
return u"première"
141+
142+
strNewNumber = re.sub(u"[erèm]", "", strNumber)
143+
if NumberFormula._isCardinalNumber(strNewNumber):
144+
strNewNumber = num2words(int(strNewNumber), ordinal=True, lang='fr')
145+
elif NumberFormula._isRomanNumber(strNewNumber):
146+
#Roman to cardinal
147+
strNewNumber = strNewNumber.encode('utf-8')
148+
cardinalNumber = fromRoman(strNewNumber)
149+
#Digits to ordinal
150+
strNewNumber = num2words(cardinalNumber, ordinal=True, lang='fr')
151+
else:
152+
strNewNumber = strNumber
153+
154+
return strNewNumber
155+
97156
@staticmethod
98157
def _decimal2word(strNumber):
99158
"""Convert a decimal number to a written
@@ -102,7 +161,17 @@ def _decimal2word(strNumber):
102161
param strNumber: an utf-8 decimal number
103162
return a 'written' decimal number
104163
"""
105-
pass
164+
strNumber = u" virgule ".join(re.split("[,]",strNumber))
165+
strNumber = u" point ".join(re.split("[.]",strNumber))
166+
167+
tokenList = []
168+
for w in re.split(LMPreparationFormula.SPACEPATTERN, strNumber):
169+
w = w.strip()
170+
if NumberFormula._isCardinalNumber(w):
171+
w = NumberFormula._cardinal2word(w)
172+
tokenList.append(w)
173+
174+
return u" ".join(tokenList)
106175

107176
@staticmethod
108177
def _roman2word(strNumber):
@@ -112,8 +181,10 @@ def _roman2word(strNumber):
112181
param strNumber: an utf-8 roman number
113182
return a 'written' roman number
114183
"""
115-
pass
116-
184+
strNumber = strNumber.encode('utf-8')
185+
cardinalNumber = fromRoman(strNumber)
186+
return NumberFormula._cardinal2word(cardinalNumber)
187+
117188
@staticmethod
118189
def _isCardinalNumber(strWord):
119190
"""Check if 'strWord' is a cardinal number.

scripts/common/french/unit_test/FormulaNumberUnitTest.py

+76-3
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,82 @@
2626

2727
class FormulaNumberUnitTest(unittest.TestCase):
2828

29-
###############
29+
testDict = { "cardinal": [(u"10",u"dix"),(u"25",u"vingt cinq")],
30+
"ordinal" : [(u"1er",u"premier"),(u"1ère",u"première"),(u"2ème",u"deuxième"),
31+
(u"Vème",u"cinquième"), (u"Xème",u"dixième")],
32+
"decimal" : [(u"2,5",u"deux virgule cinq"), (u"2.5,3",u"deux point cinq virgule trois")],
33+
"roman" : [(u"V",u"cinq"), (u"X",u"dix")],
34+
"all" : [(u"1ab",u"1ab"),(u"ab",u"ab"),
35+
(u"le 25 mars 2015 2.5 Xème",u"le vingt cinq mars deux mille quinze deux point cinq dixième")]
36+
}
37+
38+
#################
39+
# Implementation
40+
#
41+
def evaluateListValues(self, testList, callback):
42+
for t, gt in testList:
43+
r = callback(t).encode('utf-8')
44+
self.assertEquals(gt.encode('utf-8'), r, r)
45+
46+
#################
3047
# Unit tests
3148
#
32-
def testIsCardinal(self):
33-
pass
49+
def test_isCardinal(self):
50+
testList = [(u"2",True),(u"123",True), (u"123.",False)]
51+
52+
for t, gt in testList:
53+
self.assertEquals(NumberFormula._isCardinalNumber(t), gt, t.encode('utf-8'))
54+
55+
def test_isOrdinal(self):
56+
testList = [(u"1er",True), (u"1re",True), (u"1ère",True), (u"2e",True), (u"2ème",True), ]
57+
58+
for t, gt in testList:
59+
self.assertEquals(NumberFormula._isOrdinalNumber(t), gt, t.encode('utf-8'))
60+
61+
def test_isDecimal(self):
62+
testList = [(u"2.5",True), (u"2,5",True),(u"2,5,3",True), (u"2-5",False)]
63+
64+
for t, gt in testList:
65+
self.assertEquals(NumberFormula._isDecimalNumber(t), gt, t.encode('utf-8'))
66+
67+
def test_isRoman(self):
68+
testList = [(u"V",True), (u"Ier",True),(u"XII",True), (u"XIIème",True)]
69+
70+
for t, gt in testList:
71+
self.assertEquals(NumberFormula._isRomanNumber(t), gt, t.encode('utf-8'))
72+
73+
def test_hasNumber(self):
74+
testList = [(u"12",True), (u"1ab",True),(u"ab22",True), (u"Xab",True),
75+
(u"xab",False), (u"a1ab",True)]
76+
77+
for t, gt in testList:
78+
self.assertEquals(NumberFormula._hasNumber(t), gt, t.encode('utf-8'))
79+
80+
def test_normalizeNumber(self):
81+
testList = [(u"50'000",u"50000"),(u"550'000'000",u"550000000")]
82+
self.evaluateListValues(testList, NumberFormula._normalizeNumber)
83+
84+
def test_cardinal2word(self):
85+
testList = self.testDict["cardinal"]
86+
self.evaluateListValues(testList, NumberFormula._cardinal2word)
87+
88+
def test_ordinal2word(self):
89+
testList = self.testDict["ordinal"]
90+
self.evaluateListValues(testList, NumberFormula._ordinal2word)
91+
92+
def test_decimal2word(self):
93+
testList = self.testDict["decimal"]
94+
self.evaluateListValues(testList, NumberFormula._decimal2word)
95+
96+
def test_roman2word(self):
97+
testList = self.testDict["roman"]
98+
self.evaluateListValues(testList, NumberFormula._roman2word)
99+
100+
def test_apply(self):
101+
f = NumberFormula()
102+
103+
for k in self.testDict.keys():
104+
#print "Testing %s " % k
105+
testList = self.testDict[k]
106+
self.evaluateListValues(testList, f.apply)
34107

scripts/common/roman.py

-90
This file was deleted.

0 commit comments

Comments
 (0)