28
28
29
29
import logging , re
30
30
from FormulaLMPreparation import LMPreparationFormula
31
+ from num2words import num2words
32
+ from roman import fromRoman
31
33
32
34
class NumberFormula ():
33
35
"""Various number formats expansion.
34
36
"""
35
37
logger = logging .getLogger ("Asrt.NumberFormula" )
36
38
39
+ THOUSANDSEPARATOR = u"'"
40
+
41
+ HASNUMBERREGEX = re .compile (u"([0-9]|I|V|X|L|C|D|M)+" , flags = re .UNICODE )
37
42
CARDINALNUMBERREGEX = re .compile (u"[0-9]+$" , flags = re .UNICODE )
38
- ORDINALNUMBERREGEX = re .compile (u"(1er|1re|[0-9]+e||[0-9]+ème)$" , flags = re .UNICODE )
39
- DECIMALNUMBERREGEX = re .compile (u"[0-9]+[,.][0-9]+$" , flags = re .UNICODE )
40
- ROMANNUMBERREGEX = re .compile (u"(I|V|X|L|C|D|M)+(er|re|e|eme|ème)$" , flags = re .UNICODE )
43
+ ORDINALNUMBERREGEX = re .compile (u"(1er|1re|1ère|[0-9]+e|[0-9]+ème|Ier|Ière|[IVXLCDM]+ème)$" , flags = re .UNICODE )
44
+ ORDINALREPLACEREGEX = re .compile (u"[erèm]" , flags = re .UNICODE )
45
+ DECIMALNUMBERREGEX = re .compile (u"[0-9,.]+[0-9,.]*$" , flags = re .UNICODE )
46
+ ROMANNUMBERREGEX = re .compile (u"[IVXLCDM]+(er|re|ère|e|ème)?$" , flags = re .UNICODE )
41
47
42
48
##################
43
49
#Public interface
@@ -54,26 +60,61 @@ def apply(self, strText):
54
60
param strText: an utf-8 encoded string
55
61
return an utf-8 encoded string
56
62
"""
57
- wordsList = re .split (LMPreparationFormula .SPACEREGEX , strText , flags = re .UNICODE )
63
+ wordsList = re .split (LMPreparationFormula .SPACEPATTERN , strText , flags = re .UNICODE )
58
64
59
65
newWordsList = []
60
66
for w in wordsList :
61
- if self ._isCardinalNumber (w ):
62
- pass
63
- elif self ._isOrdinalNumber (w ):
64
- pass
65
- elif self ._isDecimalNumber (w ):
66
- pass
67
- elif self ._isRomanNumber (w ):
68
- pass
69
- else :
67
+ if not self ._hasNumber (w ):
68
+ newWordsList .append (w )
69
+ continue
70
+ #Numbers may contain alphanumeric
71
+ #characters
72
+ wNorm = self ._normalizeNumber (w )
73
+ try :
74
+ #Now check number type
75
+ if self ._isCardinalNumber (wNorm ):
76
+ wNorm = self ._cardinal2word (wNorm )
77
+ elif self ._isOrdinalNumber (wNorm ):
78
+ wNorm = self ._ordinal2word (wNorm )
79
+ elif self ._isDecimalNumber (wNorm ):
80
+ wNorm = self ._decimal2word (wNorm )
81
+ elif self ._isRomanNumber (wNorm ):
82
+ wNorm = self ._roman2word (wNorm )
83
+ else :
84
+ self .logger .info ("Unknown number format: %s" % w .encode ('utf-8' ))
85
+ wNorm = w
86
+ newWordsList .append (wNorm )
87
+
88
+ except Exception , e :
89
+ self .logger .warning ("Error formatting number (%s): %s" % \
90
+ (w .encode ('utf-8' ), str (e )))
70
91
newWordsList .append (w )
71
92
72
93
return u" " .join (newWordsList )
73
94
74
95
##################
75
96
#Implementation
76
97
#
98
@staticmethod
def _hasNumber(strWord):
    """Check if 'strWord' contains a number.

       The word is scanned with HASNUMBERREGEX, which accepts
       arabic digits as well as the upper case roman numeral
       letters I, V, X, L, C, D and M.

       param strWord: the word to inspect
       return True if a match is found anywhere in the
              word, False otherwise
    """
    #Use search instead of match: the number may
    #appear anywhere in the word, not only at its start
    return NumberFormula.HASNUMBERREGEX.search(strWord) is not None
107
+
108
@staticmethod
def _normalizeNumber(strWord):
    """Strip the thousand separator from 'strWord'.

       Every occurrence of THOUSANDSEPARATOR is removed,
       i.e. 10'000 --> 10000

       param strWord: the word to normalize
       return the word without any thousand separator
    """
    return strWord.replace(NumberFormula.THOUSANDSEPARATOR, u"")
117
+
77
118
@staticmethod
78
119
def _cardinal2word (strNumber ):
79
120
"""Convert a cardinal number to a written
@@ -82,18 +123,36 @@ def _cardinal2word(strNumber):
82
123
param strNumber: an utf-8 cardinal number
83
124
return a 'written' cardinal number
84
125
"""
85
- pass
126
+ strNumber = num2words (int (strNumber ), lang = 'fr' )
127
+ return strNumber .replace (u"-" , u" " )
86
128
87
129
@staticmethod
def _ordinal2word(strNumber):
    """Convert an ordinal number to a written
       word.

       i.e. 1er --> premier
            1re --> première

       param strNumber: an ordinal number, written with
                        digits or with roman numerals
       return a 'written' ordinal number, or 'strNumber'
              unchanged when its format is not recognized
    """
    #Feminine forms of 'first' must be handled explicitly:
    #once the suffix is stripped only '1' or 'I' is left,
    #which is rendered as the masculine 'premier'
    if strNumber in (u"1re", u"1ère", u"Ière"):
        return u"première"

    #Drop the ordinal suffix letters (er, re, ère, e, ème)
    strNewNumber = NumberFormula.ORDINALREPLACEREGEX.sub(u"", strNumber)

    if NumberFormula._isCardinalNumber(strNewNumber):
        return num2words(int(strNewNumber), ordinal=True, lang='fr')

    if NumberFormula._isRomanNumber(strNewNumber):
        #Roman to cardinal: fromRoman is given a utf-8
        #byte string, as in _roman2word
        cardinalNumber = fromRoman(strNewNumber.encode('utf-8'))
        #Digits to ordinal
        return num2words(cardinalNumber, ordinal=True, lang='fr')

    #Neither digits nor roman numerals: leave untouched
    return strNumber
155
+
97
156
@staticmethod
98
157
def _decimal2word (strNumber ):
99
158
"""Convert a decimal number to a written
@@ -102,7 +161,17 @@ def _decimal2word(strNumber):
102
161
param strNumber: an utf-8 decimal number
103
162
return a 'written' decimal number
104
163
"""
105
- pass
164
+ strNumber = u" virgule " .join (re .split ("[,]" ,strNumber ))
165
+ strNumber = u" point " .join (re .split ("[.]" ,strNumber ))
166
+
167
+ tokenList = []
168
+ for w in re .split (LMPreparationFormula .SPACEPATTERN , strNumber ):
169
+ w = w .strip ()
170
+ if NumberFormula ._isCardinalNumber (w ):
171
+ w = NumberFormula ._cardinal2word (w )
172
+ tokenList .append (w )
173
+
174
+ return u" " .join (tokenList )
106
175
107
176
@staticmethod
108
177
def _roman2word (strNumber ):
@@ -112,8 +181,10 @@ def _roman2word(strNumber):
112
181
param strNumber: an utf-8 roman number
113
182
return a 'written' roman number
114
183
"""
115
- pass
116
-
184
+ strNumber = strNumber .encode ('utf-8' )
185
+ cardinalNumber = fromRoman (strNumber )
186
+ return NumberFormula ._cardinal2word (cardinalNumber )
187
+
117
188
@staticmethod
118
189
def _isCardinalNumber (strWord ):
119
190
"""Check if 'strWord' is a cardinal number.
0 commit comments