From d07f18b20c5b931171f75fdc7694291f9eca8b07 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Mon, 13 May 2024 20:48:07 -0500 Subject: [PATCH 1/6] first release, wrong output --- README.md | 2 +- example/alpha/main.go | 16 +++ go.mod | 3 + itn/base.go | 171 +++++++++++++++++++++++++++++ itn/es.go | 164 ++++++++++++++++++++++++++++ itn/es_test.go | 102 +++++++++++++++++ itn/parsers.go | 247 ++++++++++++++++++++++++++++++++++++++++++ itn/transforms.go | 1 + itn/utils.go | 15 +++ 9 files changed, 720 insertions(+), 1 deletion(-) create mode 100644 example/alpha/main.go create mode 100644 go.mod create mode 100644 itn/base.go create mode 100644 itn/es.go create mode 100644 itn/es_test.go create mode 100644 itn/parsers.go create mode 100644 itn/transforms.go create mode 100644 itn/utils.go diff --git a/README.md b/README.md index 0a77692..c026017 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# itn \ No newline at end of file +# Inverse Text Normalization diff --git a/example/alpha/main.go b/example/alpha/main.go new file mode 100644 index 0000000..a4dcaa1 --- /dev/null +++ b/example/alpha/main.go @@ -0,0 +1,16 @@ +package main + +import ( + "github.com/pablodz/itn/itn" +) + +func main() { + processor := itn.NewSpanishLanguage() + new_string := processor.Alpha2Digit( + "uno dos tres cuatro siete siete noventa 89", + false, + true, + 3, + ) + println(new_string) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..bf03c63 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/pablodz/itn + +go 1.22.3 \ No newline at end of file diff --git a/itn/base.go b/itn/base.go new file mode 100644 index 0000000..8cdd4d8 --- /dev/null +++ b/itn/base.go @@ -0,0 +1,171 @@ +package itn + +import ( + "log" + "regexp" + "strings" +) + +type Language struct { + Multipliers map[string]int + Units map[string]int + STens map[string]int + MTens map[string]int + MTensWSTens []string + Hundred map[string]int + MHundreds map[string]int + Numbers map[string]int + Sign map[string]string + Zero []string + DecimalSep string + DecimalSYM string + AndNums []string + And string + NeverIfAlone []string + Relaxed map[string]RelaxTuple + Simplify_check_coef_appliable bool +} + +type RelaxTuple struct { + Zero string + One string +} + +func (lg *Language) NotNumericWord(word string) bool { + isEmpty := false + if word == "" { + isEmpty = true + } + + isDecimalSep := false + if word == lg.DecimalSep { + isDecimalSep = true + } + + isNotNumber := false + if _, ok := lg.Numbers[word]; !ok { + isNotNumber = true + } + + isNotZero := false + for _, zero := range lg.Zero { + if word == zero { + isNotZero = true + break + } + } + + return isEmpty || isDecimalSep && isNotNumber && isNotZero +} + +func (lg *Language) Normalize(word string) string { + return word +} + +func (lg *Language) Ord2Card(word string) string { + return "" +} + +func (lg *Language) NumOrd(digits string, originalWord string) string { + return "" +} + +var WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`) + +type segmentAndPunct struct { + segment string + punct string +} + +type LookAhead struct { + Word string + Ahead string +} + +func lookAhead(tokens []string) []LookAhead { + if len(tokens) == 0 { + return []LookAhead{} + } + + lookAheads := []LookAhead{} + for i := 0; i < len(tokens); i++ { + + nextWord := "" + if i+1 >= len(tokens) { + nextWord = "" + } else { + nextWord = tokens[i+1] + } + + lookAheads = append(lookAheads, LookAhead{tokens[i], nextWord}) + } + // fill the last element with empty next + lookAheads[len(lookAheads)-1].Ahead = "" + + return lookAheads +} + +func (lg *SpanishLanguage) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { + segments := WORDSEP.Split(text, -1) + for i, segment := range segments { + log.Println("[segment]", i, segment) + } + punct := WORDSEP.FindAllString(text, -1) + for i, p := range punct { + log.Println("[punct]", i, p) + } + + if len(punct) < len(segments) { + punct = append(punct, "") + } + + segmentAndPuncts := []segmentAndPunct{} + for i, segment := range segments { + segmentAndPuncts = append(segmentAndPuncts, segmentAndPunct{segment, punct[i]}) + } + + outSegments := []string{} + for _, sp := range segmentAndPuncts { + tokens := strings.Split(sp.segment, " ") + log.Printf("[sp.segment] %s [len]%d", sp.segment, len(tokens)) + + numBuilder := NewWordToDigitParser(lg.Language, relaxed, signed, ordinalThreshold, "") + lastWord := "" + inNumber := false + outTokens := []string{} + for _, couple := range lookAhead(tokens) { + + log.Printf("[word] %s [next] %s", couple.Word, couple.Ahead) + + if numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) { + log.Printf("condition 1: word %s ahead %s", couple.Word, couple.Ahead) + inNumber = true + } else if inNumber { + log.Printf("condition 2: word %s ahead %s", couple.Word, couple.Ahead) + log.Printf("numBuilder.value() >>>>>>>> %s", numBuilder.GetValue()) + outTokens = append(outTokens, numBuilder.GetValue()) + log.Printf("relaxed %v signed %v ordinalThreshold %d lastWord %s", relaxed, signed, ordinalThreshold, lastWord) + numBuilder = NewWordToDigitParser(lg.Language, relaxed, signed, ordinalThreshold, lastWord) + inNumber = numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) + log.Printf("inNumber %v", inNumber) + } + + if !inNumber { + log.Printf("condition 3: word %s ahead %s", couple.Word, couple.Ahead) + outTokens = append(outTokens, couple.Word) + } + lastWord = strings.ToLower(couple.Word) + } + numBuilder.close() + if numBuilder.GetValue() != "" { + log.Printf("numBuilder.value() %s", numBuilder.GetValue()) + outTokens = append(outTokens, numBuilder.GetValue()) + } + + outSegments = append(outSegments, strings.Join(outTokens, " ")) + outSegments = append(outSegments, sp.punct) + } + text = strings.Join(outSegments, "") + + return text +} diff --git a/itn/es.go b/itn/es.go new file mode 100644 index 0000000..d17af43 --- /dev/null +++ b/itn/es.go @@ -0,0 +1,164 @@ +package itn + +import ( + "fmt" + "strings" +) + +type SpanishLanguage struct { + *Language + Composites map[string]int +} + +func NewSpanishLanguage() *SpanishLanguage { + l := &SpanishLanguage{} + l.Language = &Language{} + + l.Multipliers = map[string]int{ + "mil": 1000, + "miles": 1000, + "millon": 1000000, + "millón": 1000000, + "millones": 1000000, + } + + l.Units = map[string]int{ + "uno": 1, + "dos": 2, + "tres": 3, + "cuatro": 4, + "cinco": 5, + "seis": 6, + "siete": 7, + "ocho": 8, + "nueve": 9, + "un": 1, // optional + "una": 1, // optional + } + + l.STens = map[string]int{ + "diez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dieciseis": 16, + "diecisiete": 17, + "dieciocho": 18, + "diecinueve": 19, + "veinte": 20, + "veintiuno": 21, + "veintidos": 22, + "veintitres": 23, + "veinticuatro": 24, + "veinticinco": 25, + "veintiseis": 26, + "veintisiete": 27, + "veintiocho": 28, + "veintinueve": 29, + "veintitrés": 23, // with accent + "veintidós": 22, // with accent + } + + l.MTens = map[string]int{ + "treinta": 30, + "cuarenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "ochenta": 80, + "noventa": 90, + } + + l.MTensWSTens = []string{} + + l.Hundred = map[string]int{ + "cien": 100, + "ciento": 100, + "cienta": 100, + "doscientos": 200, + "trescientos": 300, + "cuatrocientos": 400, + "quinientos": 500, + "seiscientos": 600, + "setecientos": 700, + "ochocientos": 800, + "novecientos": 900, + "doscientas": 200, // with feminine + "trescientas": 300, // with feminine + "cuatrocientas": 400, // with feminine + "quinientas": 500, // with feminine + "seiscientas": 600, // with feminine + "setecientas": 700, // with feminine + "ochocientas": 800, // with feminine + "novecientas": 900, // with feminine + } + + l.Composites = map[string]int{} + + // deep copy from l.multipliers + l.Numbers = l.Multipliers + for k, v := range l.Units { + l.Numbers[k] = v + } + for k, v := range l.STens { + l.Numbers[k] = v + } + for k, v := range l.MTens { + l.Numbers[k] = v + } + for k, v := range l.Hundred { + l.Numbers[k] = v + } + for k, v := range l.Composites { + l.Numbers[k] = v + } + + l.Sign = map[string]string{ + "mas": "+", + "menos": "-", + } + + l.Zero = []string{ + "cero", + } + + l.DecimalSep = "coma" + l.DecimalSYM = "." + + l.AndNums = []string{ + "un", + "uno", + "una", + "dos", + "tres", + "cuatro", + "cinco", + "seis", + "siete", + "ocho", + "nueve", + } + + l.And = "y" + + l.NeverIfAlone = []string{ + "un", + "uno", + "una", + } + + return l +} + +func (lg *SpanishLanguage) Ord2Card(word string) string { + return word +} + +func (lg *SpanishLanguage) NumOrd(digits string, originalWord string) string { + if strings.HasSuffix(originalWord, "o") { + return fmt.Sprintf("%sº", digits) + } + return fmt.Sprintf("%sª", digits) +} diff --git a/itn/es_test.go b/itn/es_test.go new file mode 100644 index 0000000..94985a5 --- /dev/null +++ b/itn/es_test.go @@ -0,0 +1,102 @@ +package itn + +import ( + "testing" +) + +func TestAlpha2Digit(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ + { + input: "uno coma uno", + output: "1.1", + }, + { + input: "uno coma cuatrocientos uno", + output: "1.401", + }, + { + input: "veinticinco vacas, doce gallinas y ciento veinticinco kg de patatas.", + output: "25 vacas, 12 gallinas y 125 kg de patatas.", + }, + { + input: "Habían trescientos hombres y quinientas mujeres", + output: "Habían 300 hombres y 500 mujeres", + }, + { + input: "mil doscientos sesenta y seis dolares.", + output: "1266 dolares.", + }, + { + input: "un dos tres cuatro veinte quince", + output: "1 2 3 4 20 15", + }, + { + input: "veintiuno, treinta y uno.", + output: "21, 31.", + }, + { + input: "un dos tres cuatro treinta cinco.", + output: "1 2 3 4 35.", + }, + { + input: "un dos tres cuatro veinte, cinco.", + output: "1 2 3 4 20, 5.", + }, + { + input: "treinta y cuatro = treinta cuatro", + output: "34 = 34", + }, { + input: "mas treinta y tres nueve sesenta cero seis doce veintiuno", + output: "+33 9 60 06 12 21", + }, + { + input: "cero nueve sesenta cero seis doce veintiuno", + output: "09 60 06 12 21", + }, { + input: "cincuenta sesenta treinta y once", + output: "50 60 30 y 11", + }, { + input: "trece mil cero noventa", + output: "13000 090", + }, { + input: "cero", + output: "0", + }, { + input: "doce coma noventa y nueve, ciento veinte coma cero cinco, uno coma doscientos treinta y seis, uno coma dos tres seis.", + output: "12.99, 120.05, 1.236, 1.2 3 6.", + }, { + input: "coma quince", + output: "0.15", + }, { + input: "Tenemos mas veinte grados dentro y menos quince fuera.", + output: "Tenemos +20 grados dentro y -15 fuera.", + }, { + input: "Un momento por favor! treinta y un gatos. Uno dos tres cuatro!", + output: "Un momento por favor! 31 gatos. 1 2 3 4!", + }, { + input: "Ni uno. Uno uno. Treinta y uno", + output: "Ni uno. 1 1. 31", + }, { + input: "un millon", + output: "1000000", + }, { + input: "un millón", + output: "1000000", + }, + } + + for _, tt := range tests { + processor := NewSpanishLanguage() + new_string := processor.Alpha2Digit(tt.input, false, true, 3) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} diff --git a/itn/parsers.go b/itn/parsers.go new file mode 100644 index 0000000..ddebb50 --- /dev/null +++ b/itn/parsers.go @@ -0,0 +1,247 @@ +package itn + +import ( + "fmt" + "log" + "strings" +) + +type WordStreamValueParser struct { + Skip string + n000Val int + grpVal int + lastWord string + lang *Language + relaxed bool +} + +func NewWordStreamValueParser(lang *Language, relaxed bool) *WordStreamValueParser { + return &WordStreamValueParser{ + Skip: "", + n000Val: 0, + grpVal: 0, + lang: lang, + relaxed: relaxed, + } +} + +func (w *WordStreamValueParser) GetValue() int { + return w.n000Val + w.grpVal +} + +func (w *WordStreamValueParser) groupExpects(word string, update bool) bool { + expected := false + if w.lastWord == "" { + expected = true + } else if containsKey(w.lang.Units, w.lastWord) && w.grpVal < 10 || containsKey(w.lang.STens, w.lastWord) && w.grpVal < 20 { + expected = containsKey(w.lang.Hundred, word) + } else if containsKey(w.lang.MHundreds, w.lastWord) { + expected = true + } else if containsKey(w.lang.MTens, w.lastWord) { + expected = containsKey(w.lang.Units, word) || containsKey(w.lang.STens, word) && contains(w.lang.MTensWSTens, w.lastWord) + } else if containsKey(w.lang.Hundred, w.lastWord) { + expected = !containsKey(w.lang.Hundred, word) + } + + if update { + w.lastWord = word + } + + return expected +} + +func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { + if w.lang.Simplify_check_coef_appliable { + return coef != w.GetValue() + } + + if coef > w.GetValue() && (w.GetValue() > 0 || coef >= 100) { + return true + } + + if coef*1000 <= w.n000Val || coef == 100 && 100 > w.grpVal { + return (w.grpVal > 0 || coef == 1000 || coef == 100) + } + + return false +} + +func (w *WordStreamValueParser) push(word string, lookAhead string) bool { + if word == "" { + return false + } + + if word == w.lang.And && contains(w.lang.AndNums, lookAhead) { + return true + } + + word = w.lang.Normalize(word) + if !containsKey(w.lang.Numbers, word) { + return false + } + + relaxed := w.lang.Relaxed + if containsKey(w.lang.Multipliers, word) { + coef := w.lang.Multipliers[word] + if !w.isCoefAppliable(coef) { + return false + } + + if coef < 1000 { + if w.grpVal == 0 { + w.grpVal = 1 + } + w.grpVal = w.grpVal * coef + w.lastWord = "" + return true + } + if coef < w.n000Val { + if w.grpVal == 0 { + w.grpVal = 1 + } + w.n000Val = w.n000Val + coef*(w.grpVal) + } else { + if w.grpVal == 0 { + w.grpVal = 1 + } + w.n000Val = w.GetValue() * coef + } + w.grpVal = 0 + w.lastWord = "" + + } else if w.relaxed && containsKey(relaxed, word) && lookAhead != "" && strings.HasPrefix(relaxed[word].Zero, lookAhead) && w.groupExpects(relaxed[word].One, false) { + w.Skip = relaxed[word].Zero + w.grpVal = w.grpVal + w.lang.Numbers[relaxed[word].One] + } else if w.Skip != "" && strings.HasPrefix(w.Skip, word) { + w.Skip = "" + } else if w.groupExpects(word, true) { + if containsKey(w.lang.Hundred, word) { + if w.grpVal != 0 { + w.grpVal = 100 * w.grpVal + } else { + w.grpVal = w.lang.Hundred[word] + } + } else if containsKey(w.lang.MHundreds, word) { + w.grpVal = w.lang.MHundreds[word] + } else { + w.grpVal = w.grpVal + w.lang.Numbers[word] + } + } else { + w.Skip = "" + return false + } + + return true +} + +type WordToDigitParser struct { + Lang *Language + value []string + IntBuilder *WordStreamValueParser + FracBuilder *WordStreamValueParser + Signed bool + InFrac bool + Closed bool + Open bool + LastWord string + OrdinalThreshold int +} + +func NewWordToDigitParser(lang *Language, relaxed bool, signed bool, ordinalThreshold int, precedingWord string) *WordToDigitParser { + return &WordToDigitParser{ + Lang: lang, + value: []string{}, + IntBuilder: NewWordStreamValueParser(lang, relaxed), + FracBuilder: NewWordStreamValueParser(lang, relaxed), + Signed: signed, + InFrac: false, + Closed: false, + Open: false, + LastWord: precedingWord, + OrdinalThreshold: ordinalThreshold, + } +} + +func (w *WordToDigitParser) GetValue() string { + return strings.Join(w.value, "") +} + +func (w *WordToDigitParser) close() { + if !w.Closed { + if w.InFrac && w.FracBuilder.GetValue() > 0 { + w.value = append(w.value, fmt.Sprint(w.FracBuilder.GetValue())) + } else if !w.InFrac && w.IntBuilder.GetValue() > 0 { + w.value = append(w.value, fmt.Sprint(w.IntBuilder.GetValue())) + } + w.Closed = true + } +} + +func (w *WordToDigitParser) atStartOfSeq() bool { + return (w.InFrac && w.FracBuilder.GetValue() == 0 || !w.InFrac && w.IntBuilder.GetValue() == 0) +} + +func (w *WordToDigitParser) atStart() bool { + return !w.Open +} + +func (w *WordToDigitParser) the_push(word string, lookAhead string) bool { + builder := w.IntBuilder + if w.InFrac { + builder = w.FracBuilder + } + return builder.push(word, lookAhead) +} + +func (w *WordToDigitParser) isAlone(word string, nextWord string) bool { + return !w.Open && contains(w.Lang.NeverIfAlone, word) && w.Lang.NotNumericWord(nextWord) && w.Lang.NotNumericWord(w.LastWord) && !(nextWord == "" && w.LastWord == "") +} + +func (w *WordToDigitParser) push(word string, lookAhead string) bool { + if w.Closed || w.isAlone(word, lookAhead) { + w.LastWord = word + return false + } + + if w.Signed && containsKey(w.Lang.Sign, word) && containsKey(w.Lang.Numbers, lookAhead) && w.atStart() { + log.Printf("> condition 1: word %s ahead %s", word, lookAhead) + w.value = append(w.value, w.Lang.Sign[word]) + } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() && lookAhead != "" && strings.Contains(w.Lang.DecimalSep, lookAhead) { + log.Printf("> condition 2: word %s ahead %s", word, lookAhead) + } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() { + log.Printf("> condition 3: word %s ahead %s", word, lookAhead) + w.value = append(w.value, "0") + } else if w.the_push(w.Lang.Ord2Card(word), lookAhead) { + log.Printf("> condition 4: word %s ahead %s", word, lookAhead) + value2Add := word + if w.IntBuilder.GetValue() > w.OrdinalThreshold { + digits := w.IntBuilder.GetValue() + if w.InFrac { + digits = w.FracBuilder.GetValue() + } + value2Add = w.Lang.NumOrd(fmt.Sprint(digits), word) + } + w.value = append(w.value, value2Add) + w.Closed = true + } else if word == w.Lang.DecimalSep || contains(strings.Split(w.Lang.DecimalSep, ","), word) && (containsKey(w.Lang.Numbers, lookAhead) || contains(w.Lang.Zero, lookAhead)) && !w.InFrac { + log.Printf("> condition 5: word %s ahead %s", word, lookAhead) + if w.GetValue() == "" { + w.value = append(w.value, fmt.Sprint(w.IntBuilder.GetValue())) + } + w.value = append(w.value, w.Lang.DecimalSYM) + w.InFrac = true + } else if !w.the_push(word, lookAhead) { + log.Printf("> condition 6: word %s ahead %s", word, lookAhead) + if w.Open { + w.close() + } + w.LastWord = word + return false + } + + log.Printf("word %s ahead %s", word, lookAhead) + + w.Open = true + w.LastWord = word + return true +} diff --git a/itn/transforms.go b/itn/transforms.go new file mode 100644 index 0000000..ceffb1f --- /dev/null +++ b/itn/transforms.go @@ -0,0 +1 @@ +package itn diff --git a/itn/utils.go b/itn/utils.go new file mode 100644 index 0000000..d652285 --- /dev/null +++ b/itn/utils.go @@ -0,0 +1,15 @@ +package itn + +func contains(slice []string, word string) bool { + for _, v := range slice { + if v == word { + return true + } + } + return false +} + +func containsKey[T int | string | RelaxTuple](dict map[string]T, key string) bool { + _, ok := dict[key] + return ok +} From fd27267b66dfc71e0e80a13033cc8f8ca0ecf46b Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Tue, 14 May 2024 12:21:39 -0500 Subject: [PATCH 2/6] fix first word itn --- example/alpha/main.go | 4 +- itn/base.go | 213 +++++++++++++++++++++++++++++++++--------- itn/es.go | 164 -------------------------------- itn/es_test.go | 2 +- itn/parsers.go | 68 +++++++++----- itn/transforms.go | 1 - 6 files changed, 215 insertions(+), 237 deletions(-) delete mode 100644 itn/es.go delete mode 100644 itn/transforms.go diff --git a/example/alpha/main.go b/example/alpha/main.go index a4dcaa1..7169852 100644 --- a/example/alpha/main.go +++ b/example/alpha/main.go @@ -5,9 +5,9 @@ import ( ) func main() { - processor := itn.NewSpanishLanguage() + processor := itn.NewLanguageES() new_string := processor.Alpha2Digit( - "uno dos tres cuatro siete siete noventa 89", + "uno quince", false, true, 3, diff --git a/itn/base.go b/itn/base.go index 8cdd4d8..dd300ce 100644 --- a/itn/base.go +++ b/itn/base.go @@ -1,6 +1,7 @@ package itn import ( + "fmt" "log" "regexp" "strings" @@ -26,40 +27,145 @@ type Language struct { Simplify_check_coef_appliable bool } -type RelaxTuple struct { - Zero string - One string -} +func NewLanguageES() *Language { -func (lg *Language) NotNumericWord(word string) bool { - isEmpty := false - if word == "" { - isEmpty = true - } + l := &Language{ + Multipliers: map[string]int{ + "mil": 1000, + "miles": 1000, + "millon": 1000000, + "millón": 1000000, + "millones": 1000000, + }, + Units: map[string]int{ + "uno": 1, + "dos": 2, + "tres": 3, + "cuatro": 4, + "cinco": 5, + "seis": 6, + "siete": 7, + "ocho": 8, + "nueve": 9, + "un": 1, // optional + "una": 1, // optional - isDecimalSep := false - if word == lg.DecimalSep { - isDecimalSep = true + }, + STens: map[string]int{ + "diez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dieciseis": 16, + "diecisiete": 17, + "dieciocho": 18, + "diecinueve": 19, + "veinte": 20, + "veintiuno": 21, + "veintidos": 22, + "veintitres": 23, + "veinticuatro": 24, + "veinticinco": 25, + "veintiseis": 26, + "veintisiete": 27, + "veintiocho": 28, + "veintinueve": 29, + "veintitrés": 23, // with accent + "veintidós": 22, // with accent + }, + MTens: map[string]int{ + "treinta": 30, + "cuarenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "ochenta": 80, + "noventa": 90, + }, + MTensWSTens: []string{}, + Hundred: map[string]int{ + "cien": 100, + "ciento": 100, + "cienta": 100, + "doscientos": 200, + "trescientos": 300, + "cuatrocientos": 400, + "quinientos": 500, + "seiscientos": 600, + "setecientos": 700, + "ochocientos": 800, + "novecientos": 900, + "doscientas": 200, // with feminine + "trescientas": 300, // with feminine + "cuatrocientas": 400, // with feminine + "quinientas": 500, // with feminine + "seiscientas": 600, // with feminine + "setecientas": 700, // with feminine + "ochocientas": 800, // with feminine + "novecientas": 900, // with feminine + }, + Sign: map[string]string{ + "mas": "+", + "menos": "-", + }, + Zero: []string{ + "cero", + }, + DecimalSep: "coma", + DecimalSYM: ".", + AndNums: []string{ + "un", + "uno", + "una", + "dos", + "tres", + "cuatro", + "cinco", + "seis", + "siete", + "ocho", + "nueve", + }, + + And: "y", + NeverIfAlone: []string{ + "un", + "uno", + "una", + }, + Relaxed: map[string]RelaxTuple{}, } - isNotNumber := false - if _, ok := lg.Numbers[word]; !ok { - isNotNumber = true + // deep copy from l.multipliers + l.Numbers = map[string]int{ + "mil": 1000, + "miles": 1000, + "millon": 1000000, + "millón": 1000000, + "millones": 1000000, } - isNotZero := false - for _, zero := range lg.Zero { - if word == zero { - isNotZero = true - break - } + for k, v := range l.Units { + l.Numbers[k] = v + } + for k, v := range l.STens { + l.Numbers[k] = v + } + for k, v := range l.MTens { + l.Numbers[k] = v + } + for k, v := range l.Hundred { + l.Numbers[k] = v } - return isEmpty || isDecimalSep && isNotNumber && isNotZero + return l } -func (lg *Language) Normalize(word string) string { - return word +type RelaxTuple struct { + Zero string + One string } func (lg *Language) Ord2Card(word string) string { @@ -67,7 +173,18 @@ func (lg *Language) Ord2Card(word string) string { } func (lg *Language) NumOrd(digits string, originalWord string) string { - return "" + if strings.HasSuffix(originalWord, "o") { + return fmt.Sprintf("%sº", digits) + } + return fmt.Sprintf("%sª", digits) +} + +func (lg *Language) Normalize(word string) string { + return word +} + +func (lg *Language) NotNumericWord(word string) bool { + return word == "" || word != lg.DecimalSep && !containsKey(lg.Numbers, word) && !contains(lg.Zero, word) } var WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`) @@ -105,15 +222,15 @@ func lookAhead(tokens []string) []LookAhead { return lookAheads } -func (lg *SpanishLanguage) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { +func (lg *Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { segments := WORDSEP.Split(text, -1) - for i, segment := range segments { - log.Println("[segment]", i, segment) - } + // for i, segment := range segments { + // log.Println("[segment]", i, segment) + // } punct := WORDSEP.FindAllString(text, -1) - for i, p := range punct { - log.Println("[punct]", i, p) - } + // for i, p := range punct { + // log.Println("[punct]", i, p) + // } if len(punct) < len(segments) { punct = append(punct, "") @@ -121,44 +238,50 @@ func (lg *SpanishLanguage) Alpha2Digit(text string, relaxed bool, signed bool, o segmentAndPuncts := []segmentAndPunct{} for i, segment := range segments { - segmentAndPuncts = append(segmentAndPuncts, segmentAndPunct{segment, punct[i]}) + segmentAndPuncts = append(segmentAndPuncts, + segmentAndPunct{ + segment, + punct[i], + }, + ) } outSegments := []string{} for _, sp := range segmentAndPuncts { tokens := strings.Split(sp.segment, " ") - log.Printf("[sp.segment] %s [len]%d", sp.segment, len(tokens)) + log.Printf("tokens %v", tokens) - numBuilder := NewWordToDigitParser(lg.Language, relaxed, signed, ordinalThreshold, "") + numBuilder := NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, "") lastWord := "" inNumber := false outTokens := []string{} for _, couple := range lookAhead(tokens) { - log.Printf("[word] %s [next] %s", couple.Word, couple.Ahead) + log.Printf("✅ [word] %s [ahead] %s", couple.Word, couple.Ahead) - if numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) { - log.Printf("condition 1: word %s ahead %s", couple.Word, couple.Ahead) + pushed := numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) + if pushed { + log.Printf("> condition 1: word %s ahead %s", couple.Word, couple.Ahead) inNumber = true } else if inNumber { - log.Printf("condition 2: word %s ahead %s", couple.Word, couple.Ahead) - log.Printf("numBuilder.value() >>>>>>>> %s", numBuilder.GetValue()) + log.Printf("> condition 2: word %s ahead %s", couple.Word, couple.Ahead) outTokens = append(outTokens, numBuilder.GetValue()) - log.Printf("relaxed %v signed %v ordinalThreshold %d lastWord %s", relaxed, signed, ordinalThreshold, lastWord) - numBuilder = NewWordToDigitParser(lg.Language, relaxed, signed, ordinalThreshold, lastWord) + numBuilder = NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, lastWord) inNumber = numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) - log.Printf("inNumber %v", inNumber) } if !inNumber { - log.Printf("condition 3: word %s ahead %s", couple.Word, couple.Ahead) + log.Printf("> condition 3: word %s ahead %s", couple.Word, couple.Ahead) outTokens = append(outTokens, couple.Word) } + lastWord = strings.ToLower(couple.Word) + } + + log.Printf("---") numBuilder.close() if numBuilder.GetValue() != "" { - log.Printf("numBuilder.value() %s", numBuilder.GetValue()) outTokens = append(outTokens, numBuilder.GetValue()) } diff --git a/itn/es.go b/itn/es.go deleted file mode 100644 index d17af43..0000000 --- a/itn/es.go +++ /dev/null @@ -1,164 +0,0 @@ -package itn - -import ( - "fmt" - "strings" -) - -type SpanishLanguage struct { - *Language - Composites map[string]int -} - -func NewSpanishLanguage() *SpanishLanguage { - l := &SpanishLanguage{} - l.Language = &Language{} - - l.Multipliers = map[string]int{ - "mil": 1000, - "miles": 1000, - "millon": 1000000, - "millón": 1000000, - "millones": 1000000, - } - - l.Units = map[string]int{ - "uno": 1, - "dos": 2, - "tres": 3, - "cuatro": 4, - "cinco": 5, - "seis": 6, - "siete": 7, - "ocho": 8, - "nueve": 9, - "un": 1, // optional - "una": 1, // optional - } - - l.STens = map[string]int{ - "diez": 10, - "once": 11, - "doce": 12, - "trece": 13, - "catorce": 14, - "quince": 15, - "dieciseis": 16, - "diecisiete": 17, - "dieciocho": 18, - "diecinueve": 19, - "veinte": 20, - "veintiuno": 21, - "veintidos": 22, - "veintitres": 23, - "veinticuatro": 24, - "veinticinco": 25, - "veintiseis": 26, - "veintisiete": 27, - "veintiocho": 28, - "veintinueve": 29, - "veintitrés": 23, // with accent - "veintidós": 22, // with accent - } - - l.MTens = map[string]int{ - "treinta": 30, - "cuarenta": 40, - "cincuenta": 50, - "sesenta": 60, - "setenta": 70, - "ochenta": 80, - "noventa": 90, - } - - l.MTensWSTens = []string{} - - l.Hundred = map[string]int{ - "cien": 100, - "ciento": 100, - "cienta": 100, - "doscientos": 200, - "trescientos": 300, - "cuatrocientos": 400, - "quinientos": 500, - "seiscientos": 600, - "setecientos": 700, - "ochocientos": 800, - "novecientos": 900, - "doscientas": 200, // with feminine - "trescientas": 300, // with feminine - "cuatrocientas": 400, // with feminine - "quinientas": 500, // with feminine - "seiscientas": 600, // with feminine - "setecientas": 700, // with feminine - "ochocientas": 800, // with feminine - "novecientas": 900, // with feminine - } - - l.Composites = map[string]int{} - - // deep copy from l.multipliers - l.Numbers = l.Multipliers - for k, v := range l.Units { - l.Numbers[k] = v - } - for k, v := range l.STens { - l.Numbers[k] = v - } - for k, v := range l.MTens { - l.Numbers[k] = v - } - for k, v := range l.Hundred { - l.Numbers[k] = v - } - for k, v := range l.Composites { - l.Numbers[k] = v - } - - l.Sign = map[string]string{ - "mas": "+", - "menos": "-", - } - - l.Zero = []string{ - "cero", - } - - l.DecimalSep = "coma" - l.DecimalSYM = "." - - l.AndNums = []string{ - "un", - "uno", - "una", - "dos", - "tres", - "cuatro", - "cinco", - "seis", - "siete", - "ocho", - "nueve", - } - - l.And = "y" - - l.NeverIfAlone = []string{ - "un", - "uno", - "una", - } - - return l -} - -func (lg *SpanishLanguage) Ord2Card(word string) string { - return word -} - -func (lg *SpanishLanguage) NumOrd(digits string, originalWord string) string { - if strings.HasSuffix(originalWord, "o") { - return fmt.Sprintf("%sº", digits) - } - return fmt.Sprintf("%sª", digits) -} diff --git a/itn/es_test.go b/itn/es_test.go index 94985a5..5ad79d7 100644 --- a/itn/es_test.go +++ b/itn/es_test.go @@ -91,7 +91,7 @@ func TestAlpha2Digit(t *testing.T) { } for _, tt := range tests { - processor := NewSpanishLanguage() + processor := NewLanguageES() new_string := processor.Alpha2Digit(tt.input, false, true, 3) if new_string != tt.output { t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) diff --git a/itn/parsers.go b/itn/parsers.go index ddebb50..d1f0b93 100644 --- a/itn/parsers.go +++ b/itn/parsers.go @@ -11,15 +11,12 @@ type WordStreamValueParser struct { n000Val int grpVal int lastWord string - lang *Language + lang Language relaxed bool } -func NewWordStreamValueParser(lang *Language, relaxed bool) *WordStreamValueParser { - return &WordStreamValueParser{ - Skip: "", - n000Val: 0, - grpVal: 0, +func NewWordStreamValueParser(lang Language, relaxed bool) WordStreamValueParser { + return WordStreamValueParser{ lang: lang, relaxed: relaxed, } @@ -67,23 +64,32 @@ func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { } func (w *WordStreamValueParser) push(word string, lookAhead string) bool { + + log.Printf("- WordStreamValueParser.push.word %s [ahead] %s", word, lookAhead) + if word == "" { + log.Printf(">> WordStreamValueParser.push.condition 0: [word]%s [ahead] %s", word, lookAhead) return false } if word == w.lang.And && contains(w.lang.AndNums, lookAhead) { + log.Printf(">> WordStreamValueParser.push.condition 1: [word]%s [ahead] %s", word, lookAhead) return true } word = w.lang.Normalize(word) if !containsKey(w.lang.Numbers, word) { + log.Printf(">> WordStreamValueParser.push.condition 2: [word]%s [ahead] %s", word, lookAhead) return false } - relaxed := w.lang.Relaxed + RELAXED := w.lang.Relaxed if containsKey(w.lang.Multipliers, word) { + log.Printf(">> WordStreamValueParser.push.condition 3: [word]%s [ahead] %s", word, lookAhead) coef := w.lang.Multipliers[word] + log.Printf(">>> WordStreamValueParser.push.coef %d", coef) if !w.isCoefAppliable(coef) { + log.Printf(">> WordStreamValueParser.push.condition 3.1: [word]%s [ahead] %s", word, lookAhead) return false } @@ -93,6 +99,7 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { } w.grpVal = w.grpVal * coef w.lastWord = "" + log.Printf(">> WordStreamValueParser.push.condition 3.2: [word]%s [ahead] %s", word, lookAhead) return true } if coef < w.n000Val { @@ -108,37 +115,45 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { } w.grpVal = 0 w.lastWord = "" - - } else if w.relaxed && containsKey(relaxed, word) && lookAhead != "" && strings.HasPrefix(relaxed[word].Zero, lookAhead) && w.groupExpects(relaxed[word].One, false) { - w.Skip = relaxed[word].Zero - w.grpVal = w.grpVal + w.lang.Numbers[relaxed[word].One] + } else if w.relaxed && containsKey(RELAXED, word) && lookAhead != "" && strings.HasPrefix(RELAXED[word].Zero, lookAhead) && w.groupExpects(RELAXED[word].One, false) { + log.Printf(">> WordStreamValueParser.push.condition 4: [word]%s [ahead] %s", word, lookAhead) + w.Skip = RELAXED[word].Zero + w.grpVal = w.grpVal + w.lang.Numbers[RELAXED[word].One] } else if w.Skip != "" && strings.HasPrefix(w.Skip, word) { + log.Printf(">> WordStreamValueParser.push.condition 5: [word]%s [ahead] %s", word, lookAhead) w.Skip = "" } else if w.groupExpects(word, true) { + log.Printf(">> WordStreamValueParser.push.condition 6: [word]%s [ahead] %s", word, lookAhead) if containsKey(w.lang.Hundred, word) { + log.Printf(">> WordStreamValueParser.push.condition 6.1: [word]%s [ahead] %s", word, lookAhead) if w.grpVal != 0 { w.grpVal = 100 * w.grpVal } else { w.grpVal = w.lang.Hundred[word] } } else if containsKey(w.lang.MHundreds, word) { + log.Printf(">> WordStreamValueParser.push.condition 6.2: [word]%s [ahead] %s", word, lookAhead) w.grpVal = w.lang.MHundreds[word] } else { + log.Printf(">> WordStreamValueParser.push.condition 6.3: [word]%s [ahead] %s", word, lookAhead) w.grpVal = w.grpVal + w.lang.Numbers[word] + log.Printf(">>> WordStreamValueParser.push.grpVal %d", w.grpVal) } } else { + log.Printf(">> WordStreamValueParser.push.condition 7: [word]%s [ahead] %s", word, lookAhead) w.Skip = "" return false } + log.Printf(">> WordStreamValueParser.push.condition 8: [word]%s [ahead] %s", word, lookAhead) return true } type WordToDigitParser struct { Lang *Language value []string - IntBuilder *WordStreamValueParser - FracBuilder *WordStreamValueParser + IntBuilder WordStreamValueParser + FracBuilder WordStreamValueParser Signed bool InFrac bool Closed bool @@ -151,8 +166,8 @@ func NewWordToDigitParser(lang *Language, relaxed bool, signed bool, ordinalThre return &WordToDigitParser{ Lang: lang, value: []string{}, - IntBuilder: NewWordStreamValueParser(lang, relaxed), - FracBuilder: NewWordStreamValueParser(lang, relaxed), + IntBuilder: NewWordStreamValueParser(*lang, relaxed), + FracBuilder: NewWordStreamValueParser(*lang, relaxed), Signed: signed, InFrac: false, Closed: false, @@ -178,7 +193,7 @@ func (w *WordToDigitParser) close() { } func (w *WordToDigitParser) atStartOfSeq() bool { - return (w.InFrac && w.FracBuilder.GetValue() == 0 || !w.InFrac && w.IntBuilder.GetValue() == 0) + return w.InFrac && w.FracBuilder.GetValue() == 0 || !w.InFrac && w.IntBuilder.GetValue() == 0 } func (w *WordToDigitParser) atStart() bool { @@ -186,9 +201,12 @@ func (w *WordToDigitParser) atStart() bool { } func (w *WordToDigitParser) the_push(word string, lookAhead string) bool { - builder := w.IntBuilder + builder := WordStreamValueParser{} + log.Printf(">> inFrac %v word %s lookAhead %s", w.InFrac, word, lookAhead) if w.InFrac { builder = w.FracBuilder + } else { + builder = w.IntBuilder } return builder.push(word, lookAhead) } @@ -198,21 +216,23 @@ func (w *WordToDigitParser) isAlone(word string, nextWord string) bool { } func (w *WordToDigitParser) push(word string, lookAhead string) bool { + if w.Closed || w.isAlone(word, lookAhead) { + log.Printf(">> WordToDigitParser.push.condition 0:[word]%s [ahead] %s", word, lookAhead) w.LastWord = word return false } if w.Signed && containsKey(w.Lang.Sign, word) && containsKey(w.Lang.Numbers, lookAhead) && w.atStart() { - log.Printf("> condition 1: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 1:[word]%s [ahead] %s", word, lookAhead) w.value = append(w.value, w.Lang.Sign[word]) } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() && lookAhead != "" && strings.Contains(w.Lang.DecimalSep, lookAhead) { - log.Printf("> condition 2: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 2:[word]%s [ahead] %s", word, lookAhead) } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() { - log.Printf("> condition 3: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 3:[word]%s [ahead] %s", word, lookAhead) w.value = append(w.value, "0") } else if w.the_push(w.Lang.Ord2Card(word), lookAhead) { - log.Printf("> condition 4: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 4:[word]%s [ahead] %s", word, lookAhead) value2Add := word if w.IntBuilder.GetValue() > w.OrdinalThreshold { digits := w.IntBuilder.GetValue() @@ -224,14 +244,14 @@ func (w *WordToDigitParser) push(word string, lookAhead string) bool { w.value = append(w.value, value2Add) w.Closed = true } else if word == w.Lang.DecimalSep || contains(strings.Split(w.Lang.DecimalSep, ","), word) && (containsKey(w.Lang.Numbers, lookAhead) || contains(w.Lang.Zero, lookAhead)) && !w.InFrac { - log.Printf("> condition 5: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 5:[word]%s [ahead] %s", word, lookAhead) if w.GetValue() == "" { w.value = append(w.value, fmt.Sprint(w.IntBuilder.GetValue())) } w.value = append(w.value, w.Lang.DecimalSYM) w.InFrac = true } else if !w.the_push(word, lookAhead) { - log.Printf("> condition 6: word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 6:[word] %s [ahead] %s", word, lookAhead) if w.Open { w.close() } @@ -239,7 +259,7 @@ func (w *WordToDigitParser) push(word string, lookAhead string) bool { return false } - log.Printf("word %s ahead %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 7:[word] %s [ahead] %s", word, lookAhead) w.Open = true w.LastWord = word diff --git a/itn/transforms.go b/itn/transforms.go deleted file mode 100644 index ceffb1f..0000000 --- a/itn/transforms.go +++ /dev/null @@ -1 +0,0 @@ -package itn From 6b35748c481234edc911fa4f8473a025a685fdfa Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Tue, 14 May 2024 16:49:00 -0500 Subject: [PATCH 3/6] all tests working --- example/alpha/main.go | 2 +- itn/base.go | 5 +- itn/parsers.go | 128 +++++++++++++++++++++++++----------------- 3 files changed, 80 insertions(+), 55 deletions(-) diff --git a/example/alpha/main.go b/example/alpha/main.go index 7169852..bd18027 100644 --- a/example/alpha/main.go +++ b/example/alpha/main.go @@ -7,7 +7,7 @@ import ( func main() { processor := itn.NewLanguageES() new_string := processor.Alpha2Digit( - "uno quince", + "uno dos quince", false, true, 3, diff --git a/itn/base.go b/itn/base.go index dd300ce..bc83ea5 100644 --- a/itn/base.go +++ b/itn/base.go @@ -222,7 +222,7 @@ func lookAhead(tokens []string) []LookAhead { return lookAheads } -func (lg *Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { +func (lg Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { segments := WORDSEP.Split(text, -1) // for i, segment := range segments { // log.Println("[segment]", i, segment) @@ -277,6 +277,8 @@ func (lg *Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalT lastWord = strings.ToLower(couple.Word) + log.Printf("... lastWord %s, inNumber %t, outTokens %v", lastWord, inNumber, outTokens) + } log.Printf("---") @@ -287,6 +289,7 @@ func (lg *Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalT outSegments = append(outSegments, strings.Join(outTokens, " ")) outSegments = append(outSegments, sp.punct) + } text = strings.Join(outSegments, "") diff --git a/itn/parsers.go b/itn/parsers.go index d1f0b93..4ea95ec 100644 --- a/itn/parsers.go +++ b/itn/parsers.go @@ -15,32 +15,40 @@ type WordStreamValueParser struct { relaxed bool } -func NewWordStreamValueParser(lang Language, relaxed bool) WordStreamValueParser { - return WordStreamValueParser{ +func NewWordStreamValueParser(lang Language, relaxed bool) *WordStreamValueParser { + return &WordStreamValueParser{ lang: lang, relaxed: relaxed, } } func (w *WordStreamValueParser) GetValue() int { + log.Printf("+ WordStreamValueParser.GetValue") return w.n000Val + w.grpVal } func (w *WordStreamValueParser) groupExpects(word string, update bool) bool { + log.Printf("+ WordStreamValueParser.groupExpects.word %s [lastWord] %s [update] %t", word, w.lastWord, update) expected := false if w.lastWord == "" { + log.Printf(">> WordStreamValueParser.groupExpects.condition 0: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) expected = true } else if containsKey(w.lang.Units, w.lastWord) && w.grpVal < 10 || containsKey(w.lang.STens, w.lastWord) && w.grpVal < 20 { + log.Printf(">> WordStreamValueParser.groupExpects.condition 1: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) expected = containsKey(w.lang.Hundred, word) } else if containsKey(w.lang.MHundreds, w.lastWord) { + log.Printf(">> WordStreamValueParser.groupExpects.condition 2: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) expected = true } else if containsKey(w.lang.MTens, w.lastWord) { + log.Printf(">> WordStreamValueParser.groupExpects.condition 3: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) expected = containsKey(w.lang.Units, word) || containsKey(w.lang.STens, word) && contains(w.lang.MTensWSTens, w.lastWord) } else if containsKey(w.lang.Hundred, w.lastWord) { + log.Printf(">> WordStreamValueParser.groupExpects.condition 4: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) expected = !containsKey(w.lang.Hundred, word) } if update { + log.Printf(">> WordStreamValueParser.groupExpects.condition 5: [word]%s [lastWord] %s [update] %t", word, w.lastWord, update) w.lastWord = word } @@ -48,6 +56,7 @@ func (w *WordStreamValueParser) groupExpects(word string, update bool) bool { } func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { + log.Printf("+ WordStreamValueParser.isCoefAppliable.coef %d", coef) if w.lang.Simplify_check_coef_appliable { return coef != w.GetValue() } @@ -65,7 +74,7 @@ func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { func (w *WordStreamValueParser) push(word string, lookAhead string) bool { - log.Printf("- WordStreamValueParser.push.word %s [ahead] %s", word, lookAhead) + log.Printf("+ WordStreamValueParser.push.word %s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) if word == "" { log.Printf(">> WordStreamValueParser.push.condition 0: [word]%s [ahead] %s", word, lookAhead) @@ -94,24 +103,27 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { } if coef < 1000 { - if w.grpVal == 0 { - w.grpVal = 1 + value := w.grpVal + if value == 0 { + value = 1 } - w.grpVal = w.grpVal * coef + w.grpVal = value * coef w.lastWord = "" log.Printf(">> WordStreamValueParser.push.condition 3.2: [word]%s [ahead] %s", word, lookAhead) return true } if coef < w.n000Val { - if w.grpVal == 0 { - w.grpVal = 1 + value := w.n000Val + if value == 0 { + value = 1 } - w.n000Val = w.n000Val + coef*(w.grpVal) + w.n000Val = w.n000Val + coef*(value) } else { - if w.grpVal == 0 { - w.grpVal = 1 + value := w.GetValue() + if value == 0 { + value = 1 } - w.n000Val = w.GetValue() * coef + w.n000Val = value * coef } w.grpVal = 0 w.lastWord = "" @@ -150,118 +162,128 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { } type WordToDigitParser struct { - Lang *Language - value []string - IntBuilder WordStreamValueParser - FracBuilder WordStreamValueParser + Lang Language + the_value []string + IntBuilder *WordStreamValueParser + FracBuilder *WordStreamValueParser Signed bool InFrac bool Closed bool Open bool - LastWord string + lastWord string OrdinalThreshold int } -func NewWordToDigitParser(lang *Language, relaxed bool, signed bool, ordinalThreshold int, precedingWord string) *WordToDigitParser { - return &WordToDigitParser{ +func NewWordToDigitParser(lang Language, relaxed bool, signed bool, ordinalThreshold int, precedingWord string) WordToDigitParser { + return WordToDigitParser{ Lang: lang, - value: []string{}, - IntBuilder: NewWordStreamValueParser(*lang, relaxed), - FracBuilder: NewWordStreamValueParser(*lang, relaxed), + the_value: []string{}, + IntBuilder: NewWordStreamValueParser(lang, relaxed), + FracBuilder: NewWordStreamValueParser(lang, relaxed), Signed: signed, InFrac: false, Closed: false, Open: false, - LastWord: precedingWord, + lastWord: precedingWord, OrdinalThreshold: ordinalThreshold, } } func (w *WordToDigitParser) GetValue() string { - return strings.Join(w.value, "") + log.Printf("+ WordToDigitParser.GetValue") + return strings.Join(w.the_value, "") } func (w *WordToDigitParser) close() { + log.Printf("+ WordToDigitParser.close") if !w.Closed { - if w.InFrac && w.FracBuilder.GetValue() > 0 { - w.value = append(w.value, fmt.Sprint(w.FracBuilder.GetValue())) - } else if !w.InFrac && w.IntBuilder.GetValue() > 0 { - w.value = append(w.value, fmt.Sprint(w.IntBuilder.GetValue())) + if w.InFrac && w.FracBuilder.GetValue() != 0 { + log.Printf(">> WordToDigitParser.close.condition 0: adding FracBuilder %d", w.FracBuilder.GetValue()) + w.the_value = append(w.the_value, fmt.Sprint(w.FracBuilder.GetValue())) + } else if !w.InFrac && w.IntBuilder.GetValue() != 0 { + log.Printf(">> WordToDigitParser.close.condition 1: adding IntBuilder %d", w.IntBuilder.GetValue()) + w.the_value = append(w.the_value, fmt.Sprint(w.IntBuilder.GetValue())) } w.Closed = true } } func (w *WordToDigitParser) atStartOfSeq() bool { + print(">> WordToDigitParser.atStartOfSeq") return w.InFrac && w.FracBuilder.GetValue() == 0 || !w.InFrac && w.IntBuilder.GetValue() == 0 } func (w *WordToDigitParser) atStart() bool { + print(">> WordToDigitParser.atStart") return !w.Open } func (w *WordToDigitParser) the_push(word string, lookAhead string) bool { - builder := WordStreamValueParser{} - log.Printf(">> inFrac %v word %s lookAhead %s", w.InFrac, word, lookAhead) + log.Printf("🌀 >> inFrac %v [word] %s [lookAhead] %s [lastWord] %s", w.InFrac, word, lookAhead, w.lastWord) if w.InFrac { - builder = w.FracBuilder + builder := w.FracBuilder + return builder.push(word, lookAhead) } else { - builder = w.IntBuilder + builder := w.IntBuilder + return builder.push(word, lookAhead) } - return builder.push(word, lookAhead) } func (w *WordToDigitParser) isAlone(word string, nextWord string) bool { - return !w.Open && contains(w.Lang.NeverIfAlone, word) && w.Lang.NotNumericWord(nextWord) && w.Lang.NotNumericWord(w.LastWord) && !(nextWord == "" && w.LastWord == "") + return !w.Open && contains(w.Lang.NeverIfAlone, word) && w.Lang.NotNumericWord(nextWord) && w.Lang.NotNumericWord(w.lastWord) && !(nextWord == "" && w.lastWord == "") } func (w *WordToDigitParser) push(word string, lookAhead string) bool { if w.Closed || w.isAlone(word, lookAhead) { - log.Printf(">> WordToDigitParser.push.condition 0:[word]%s [ahead] %s", word, lookAhead) - w.LastWord = word + log.Printf(">> WordToDigitParser.push.condition 0:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) + w.lastWord = word return false } if w.Signed && containsKey(w.Lang.Sign, word) && containsKey(w.Lang.Numbers, lookAhead) && w.atStart() { - log.Printf(">> WordToDigitParser.push.condition 1:[word]%s [ahead] %s", word, lookAhead) - w.value = append(w.value, w.Lang.Sign[word]) + log.Printf(">> WordToDigitParser.push.condition 1:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) + w.the_value = append(w.the_value, w.Lang.Sign[word]) } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() && lookAhead != "" && strings.Contains(w.Lang.DecimalSep, lookAhead) { - log.Printf(">> WordToDigitParser.push.condition 2:[word]%s [ahead] %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 2:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) } else if contains(w.Lang.Zero, word) && w.atStartOfSeq() { - log.Printf(">> WordToDigitParser.push.condition 3:[word]%s [ahead] %s", word, lookAhead) - w.value = append(w.value, "0") + log.Printf(">> WordToDigitParser.push.condition 3:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) + w.the_value = append(w.the_value, "0") } else if w.the_push(w.Lang.Ord2Card(word), lookAhead) { - log.Printf(">> WordToDigitParser.push.condition 4:[word]%s [ahead] %s", word, lookAhead) - value2Add := word + log.Printf(">> WordToDigitParser.push.condition 4:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) + value2Add := "" if w.IntBuilder.GetValue() > w.OrdinalThreshold { - digits := w.IntBuilder.GetValue() + digits := 0 if w.InFrac { digits = w.FracBuilder.GetValue() + } else { + digits = w.IntBuilder.GetValue() } value2Add = w.Lang.NumOrd(fmt.Sprint(digits), word) + } else { + value2Add = word } - w.value = append(w.value, value2Add) + w.the_value = append(w.the_value, value2Add) w.Closed = true - } else if word == w.Lang.DecimalSep || contains(strings.Split(w.Lang.DecimalSep, ","), word) && (containsKey(w.Lang.Numbers, lookAhead) || contains(w.Lang.Zero, lookAhead)) && !w.InFrac { - log.Printf(">> WordToDigitParser.push.condition 5:[word]%s [ahead] %s", word, lookAhead) + } else if (word == w.Lang.DecimalSep || contains(strings.Split(w.Lang.DecimalSep, ","), word)) && (containsKey(w.Lang.Numbers, lookAhead) || contains(w.Lang.Zero, lookAhead)) && !w.InFrac { + log.Printf(">> WordToDigitParser.push.condition 5:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) if w.GetValue() == "" { - w.value = append(w.value, fmt.Sprint(w.IntBuilder.GetValue())) + w.the_value = append(w.the_value, fmt.Sprint(w.IntBuilder.GetValue())) } - w.value = append(w.value, w.Lang.DecimalSYM) + w.the_value = append(w.the_value, w.Lang.DecimalSYM) w.InFrac = true } else if !w.the_push(word, lookAhead) { - log.Printf(">> WordToDigitParser.push.condition 6:[word] %s [ahead] %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 6:[word] %s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) if w.Open { w.close() } - w.LastWord = word + w.lastWord = word return false } - log.Printf(">> WordToDigitParser.push.condition 7:[word] %s [ahead] %s", word, lookAhead) + log.Printf(">> WordToDigitParser.push.condition 7:[word] %s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) w.Open = true - w.LastWord = word + w.lastWord = word return true } From 64540391af783779a21bae5f68bd236ecf28001b Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Tue, 14 May 2024 16:55:06 -0500 Subject: [PATCH 4/6] itn working --- .deepsource.toml | 9 +++++++++ .github/workflows/tagger.yml | 31 +++++++++++++++++++++++++++++ .version | 1 + LICENSE | 2 +- README.md | 17 ++++++++++++++++ {example => examples}/alpha/main.go | 0 6 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 .deepsource.toml create mode 100644 .github/workflows/tagger.yml create mode 100644 .version rename {example => examples}/alpha/main.go (100%) diff --git a/.deepsource.toml b/.deepsource.toml new file mode 100644 index 0000000..7a19f92 --- /dev/null +++ b/.deepsource.toml @@ -0,0 +1,9 @@ +version = 1 + +test_patterns = ["**/*_test.go"] + +[[analyzers]] +name = "go" + + [analyzers.meta] + import_root = "github.com/pablodz/itn" diff --git a/.github/workflows/tagger.yml b/.github/workflows/tagger.yml new file mode 100644 index 0000000..5648743 --- /dev/null +++ b/.github/workflows/tagger.yml @@ -0,0 +1,31 @@ +name: tagger +on: + push: + branches: + - main +permissions: + contents: write + +jobs: + tagger: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Create tag + id: tag + run: | + VERSION_FILE=".version" + VERSION_VALUE=$(cat $VERSION_FILE) + MAX_BRANCH_LENGTH=40 + FIXED_BRANCH=$(echo ${GITHUB_REF:11:${MAX_BRANCH_LENGTH}} | sed 's/[^[:alnum:]]/-/g') + NEW_TAG=$(echo "$VERSION_VALUE-${FIXED_BRANCH}.$(date +%Y%m%d-%H%M%S)") + echo "NEW_TAG=$NEW_TAG" >> $GITHUB_ENV + + - name: Push tag + run: | + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + git tag ${{ env.NEW_TAG }} + git push origin ${{ env.NEW_TAG }} diff --git a/.version b/.version new file mode 100644 index 0000000..a1c2c6a --- /dev/null +++ b/.version @@ -0,0 +1 @@ +v0.1.1 \ No newline at end of file diff --git a/LICENSE b/LICENSE index 4b7df60..3ef92ac 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Pablo +Copyright (c) 2024 Pablo & text2num developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index c026017..a080f18 100644 --- a/README.md +++ b/README.md @@ -1 +1,18 @@ # Inverse Text Normalization + +## Installation + +```bash +go get -v github.com/pablodz/itn@latest +``` + +## Examples + +Check [folder](/examples/) + +## Supported languages + +- ✅ español +- 🌀 francés +- 🌀 italiano +- 🌀 portugués diff --git a/example/alpha/main.go b/examples/alpha/main.go similarity index 100% rename from example/alpha/main.go rename to examples/alpha/main.go From 093deec6e4e2c3879e9a72232b69342ebc83a4b2 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Tue, 14 May 2024 16:58:17 -0500 Subject: [PATCH 5/6] format --- itn/base.go | 1 - itn/es_test.go | 33 ++++++++++++++++++++++----------- itn/parsers.go | 2 -- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/itn/base.go b/itn/base.go index bc83ea5..3f59d9a 100644 --- a/itn/base.go +++ b/itn/base.go @@ -28,7 +28,6 @@ type Language struct { } func NewLanguageES() *Language { - l := &Language{ Multipliers: map[string]int{ "mil": 1000, diff --git a/itn/es_test.go b/itn/es_test.go index 5ad79d7..b8b6014 100644 --- a/itn/es_test.go +++ b/itn/es_test.go @@ -50,41 +50,52 @@ func TestAlpha2Digit(t *testing.T) { { input: "treinta y cuatro = treinta cuatro", output: "34 = 34", - }, { + }, + { input: "mas treinta y tres nueve sesenta cero seis doce veintiuno", output: "+33 9 60 06 12 21", }, { input: "cero nueve sesenta cero seis doce veintiuno", output: "09 60 06 12 21", - }, { + }, + { input: "cincuenta sesenta treinta y once", output: "50 60 30 y 11", - }, { + }, + { input: "trece mil cero noventa", output: "13000 090", - }, { + }, + { input: "cero", output: "0", - }, { + }, + { input: "doce coma noventa y nueve, ciento veinte coma cero cinco, uno coma doscientos treinta y seis, uno coma dos tres seis.", output: "12.99, 120.05, 1.236, 1.2 3 6.", - }, { + }, + { input: "coma quince", output: "0.15", - }, { + }, + { input: "Tenemos mas veinte grados dentro y menos quince fuera.", output: "Tenemos +20 grados dentro y -15 fuera.", - }, { + }, + { input: "Un momento por favor! treinta y un gatos. Uno dos tres cuatro!", output: "Un momento por favor! 31 gatos. 1 2 3 4!", - }, { + }, + { input: "Ni uno. Uno uno. Treinta y uno", output: "Ni uno. 1 1. 31", - }, { + }, + { input: "un millon", output: "1000000", - }, { + }, + { input: "un millón", output: "1000000", }, diff --git a/itn/parsers.go b/itn/parsers.go index 4ea95ec..629077c 100644 --- a/itn/parsers.go +++ b/itn/parsers.go @@ -73,7 +73,6 @@ func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { } func (w *WordStreamValueParser) push(word string, lookAhead string) bool { - log.Printf("+ WordStreamValueParser.push.word %s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) if word == "" { @@ -234,7 +233,6 @@ func (w *WordToDigitParser) isAlone(word string, nextWord string) bool { } func (w *WordToDigitParser) push(word string, lookAhead string) bool { - if w.Closed || w.isAlone(word, lookAhead) { log.Printf(">> WordToDigitParser.push.condition 0:[word]%s [ahead] %s [lastWord] %s", word, lookAhead, w.lastWord) w.lastWord = word From a7b0b97d45814ab7c0e6dd6864366b655c16d911 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Tue, 14 May 2024 17:00:09 -0500 Subject: [PATCH 6/6] format --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index bf03c63..1ef080b 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ module github.com/pablodz/itn -go 1.22.3 \ No newline at end of file +go 1.22.3