From 36d60b2140bbfdc29297ae6e8d094ff00cbb0c93 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Fri, 17 May 2024 17:08:37 -0500 Subject: [PATCH 1/2] working partially --- examples/alpha/pt/main.go | 13 ++ itn/base.go | 50 +++++--- itn/i18n.go | 192 ++++++++++++++++++++++++++++ itn/{en_test.go => lang_en_test.go} | 0 itn/{es_test.go => lang_es_test.go} | 0 itn/lang_pt_test.go | 177 +++++++++++++++++++++++++ itn/parsers.go | 5 +- itn/pt_ordinals.go | 167 ++++++++++++++++++++++++ 8 files changed, 587 insertions(+), 17 deletions(-) create mode 100644 examples/alpha/pt/main.go rename itn/{en_test.go => lang_en_test.go} (100%) rename itn/{es_test.go => lang_es_test.go} (100%) create mode 100644 itn/lang_pt_test.go create mode 100644 itn/pt_ordinals.go diff --git a/examples/alpha/pt/main.go b/examples/alpha/pt/main.go new file mode 100644 index 0000000..5fbfdeb --- /dev/null +++ b/examples/alpha/pt/main.go @@ -0,0 +1,13 @@ +package main + +import ( + "github.com/pablodz/itn/itn" +) + +func main() { + itn.SetDebug(true) + + processor, _ := itn.NewLanguage(itn.Portuguese) + new_string := processor.Alpha2Digit("Trezentos e setenta e oito milhões vinte e sete mil trezentos e doze", false, true, 3) + println(new_string) +} diff --git a/itn/base.go b/itn/base.go index 9fffd30..113403e 100644 --- a/itn/base.go +++ b/itn/base.go @@ -27,6 +27,7 @@ type Language struct { Simplify_check_coef_appliable bool // Optional RadMap map[string]string // Optional Composites map[string]int // Optional + PtOrdinals map[string]string // Only for Portuguese } type RelaxTuple struct { @@ -36,8 +37,18 @@ type RelaxTuple struct { func (lg *Language) Ord2Card(word string) string { switch lg.LangCode { - case English: + case Portuguese: logPrintf(">>>> Ord2Card.0 %s", word) + if len(word) < 1 { + return "" + } + ordinal, ok := lg.PtOrdinals[word[:len(word)-1]] + if !ok { + return "" + } + return ordinal + case English: + logPrintf(">>>> Ord2Card.1 %s", word) plurSuff := strings.HasSuffix(word, "ths") singSuff := strings.HasSuffix(word, "th") source := "" @@ -49,7 +60,7 @@ func (lg *Language) Ord2Card(word string) string { } else if strings.HasSuffix(word, "third") { source = strings.ReplaceAll(word, "third", "three") } else { - logPrintf(">>>> Ord2Card.1 %s", word) + logPrintf(">>>> Ord2Card.2 %s", word) return "" } } else { @@ -73,14 +84,13 @@ func (lg *Language) Ord2Card(word string) string { } if !containsKey(lg.Numbers, source) { - logPrintf(">>>> Ord2Card.2 %s", source) + logPrintf(">>>> Ord2Card.3 %s", source) return "" } - logPrintf(">>>> Ord2Card.3 %s", source) + logPrintf(">>>> Ord2Card.4 %s", source) return source - case Spanish: - return "" + default: return "" } @@ -89,6 +99,7 @@ func (lg *Language) Ord2Card(word string) string { func (lg *Language) NumOrd(digits string, originalWord string) string { switch lg.LangCode { case English: + logPrintf(">>>> NumOrd.0 %s", originalWord) sf := "" if strings.HasSuffix(originalWord, "s") { sf = originalWord[len(originalWord)-3:] @@ -98,14 +109,16 @@ func (lg *Language) NumOrd(digits string, originalWord string) string { return fmt.Sprintf("%s%s", digits, sf) - case Spanish: - + case Portuguese, Spanish: + logPrintf(">>>> NumOrd.1 %s", originalWord) if strings.HasSuffix(originalWord, "o") { return fmt.Sprintf("%sº", digits) } + return fmt.Sprintf("%sª", digits) } + logPrintf(">>>> NumOrd.2 ❌ %s", originalWord) return "ERROR" } @@ -120,7 +133,12 @@ func (lg *Language) NotNumericWord(word string) bool { return word == "" || word != lg.DecimalSep && !containsKey(lg.Numbers, word) && !contains(lg.Zero, word) } -var WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`) +const UsePTOrdinalsMerger = true + +var ( + WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`) + omg = OrdinalsMerger{} +) type segmentAndPunct struct { segment string @@ -171,12 +189,7 @@ func (lg Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalTh segmentAndPuncts := []segmentAndPunct{} for i, segment := range segments { - segmentAndPuncts = append(segmentAndPuncts, - segmentAndPunct{ - segment, - punct[i], - }, - ) + segmentAndPuncts = append(segmentAndPuncts, segmentAndPunct{segment, punct[i]}) } outSegments := []string{} @@ -226,5 +239,12 @@ func (lg Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalTh } text = strings.Join(outSegments, "") + logPrintf(">>> [text] %s", text) + + // Post-Processing + if lg.LangCode == Portuguese && UsePTOrdinalsMerger { + text = omg.MergeCompoundOrdinalsPT(text) + } + return text } diff --git a/itn/i18n.go b/itn/i18n.go index 37b45a2..85d4a18 100644 --- a/itn/i18n.go +++ b/itn/i18n.go @@ -233,6 +233,198 @@ func NewLanguage(LangCode LanguageCode) (*Language, error) { maps.Copy(l.Numbers, l.Hundred) maps.Copy(l.Numbers, l.Composites) + return l, nil + case Portuguese: + + l := &Language{ + LangCode: LangCode, + Multipliers: map[string]int{ + "mil": 1000, + "milhar": 1000, + "milhares": 1000, + "milhao": 1000000, + "milhão": 1000000, + "milhoes": 1000000, + "milhões": 1000000, + "bilhao": 1000000000, + "bilhão": 1000000000, + "bilhoes": 1000000000, + "bilhões": 1000000000, + "trilhao": 1000000000000, + "trilhão": 1000000000000, + "trilhoes": 1000000000000, + "trilhões": 1000000000000, + }, + Units: map[string]int{ + "um": 1, + "dois": 2, + "três": 3, + "quatro": 4, + "cinco": 5, + "seis": 6, + "sete": 7, + "oito": 8, + "nove": 9, + "uma": 1, // optional + "duas": 2, // optional + "tres": 3, // without accent + "catorze": 14, // without accent + "dezesseis": 16, // without accent + "dezessete": 17, // without accent + "dezenove": 19, // without accent + }, + STens: map[string]int{ + "dez": 10, + "onze": 11, + "doze": 12, + "treze": 13, + "catorze": 14, + "quinze": 15, + "dezasseis": 16, + "dezassete": 17, + "dezoito": 18, + "dezanove": 19, + }, + MTens: map[string]int{ + "vinte": 20, + "trinta": 30, + "quarenta": 40, + "cinquenta": 50, + "sessenta": 60, + "setenta": 70, + "oitenta": 80, + "noventa": 90, + }, + MTensWSTens: []string{}, + Hundred: map[string]int{ + "cem": 100, + "centena": 100, + "cento": 100, + "centenas": 100, + "duzentos": 200, + "duzentas": 200, + "trezentos": 300, + "trezentas": 300, + "quatrocentos": 400, + "quatrocentas": 400, + "quinhentos": 500, + "quinhentas": 500, + "seiscentos": 600, + "seiscentas": 600, + "setecentos": 700, + "setecentas": 700, + "oitocentos": 800, + "oitocentas": 800, + "novecentos": 900, + "novecentas": 900, + }, + Sign: map[string]string{ + "mais": "+", + "menos": "-", + }, + Zero: []string{ + "zero", + }, + DecimalSep: "vírgula", + DecimalSYM: ",", + AndNums: []string{ + "um", + "uma", + "duas", + "dois", + "três", + "tres", + "quatro", + "cinco", + "seis", + "sete", + "oito", + "nove", + "dez", + "onze", + "doze", + "treze", + "quatorze", + "catorze", + "quinze", + "dezasseis", + "dezesseis", + "dezassete", + "dezessete", + "dezoito", + "dezanove", + "dezenove", + "vinte", + "trinta", + "quarenta", + "cinquenta", + "sessenta", + "setenta", + "oitenta", + "noventa", + "cem", + "duzentos", + "trezentos", + "quatrocentos", + "quinhentos", + "seiscentos", + "setecentos", + "oitocentos", + "novecentos", + }, + + And: "e", + NeverIfAlone: []string{ + "um", + "uma", + }, + Relaxed: map[string]RelaxTuple{}, + RadMap: map[string]string{ + "fif": "five", + "eigh": "eight", + "nin": "nine", + "twelf": "twelve", + }, + Composites: map[string]int{}, + PtOrdinals: map[string]string{ + "primeir": "um", + "segund": "dois", + "terceir": "três", + "quart": "quatro", + "quint": "cinco", + "sext": "seis", + "sétim": "sete", + "oitav": "oito", + "non": "nove", + "décim": "dez", + "vigésim": "vinte", + "trigésim": "trinta", + "quadragésim": "quarenta", + "quinquagésim": "cinquenta", + "sexagésim": "sessenta", + "septagésim": "setenta", + "octagésim": "oitenta", + "nonagésim": "noventa", + "centésim": "cem", + "ducentésim": "cem", + "trecentésim": "cem", + "quadrigentésim": "cem", + "quingentésim": "cem", + "sexgentésim": "cem", + "setingentésim": "cem", + "octigentésim": "cem", + "nonigentésim": "mil", + "milionésim": "milhão", + }, + } + + l.Numbers = maps.Clone(l.Multipliers) + maps.Copy(l.Numbers, l.Units) + maps.Copy(l.Numbers, l.STens) + maps.Copy(l.Numbers, l.MTens) + maps.Copy(l.Numbers, l.Hundred) + maps.Copy(l.Numbers, l.Composites) + return l, nil default: diff --git a/itn/en_test.go b/itn/lang_en_test.go similarity index 100% rename from itn/en_test.go rename to itn/lang_en_test.go diff --git a/itn/es_test.go b/itn/lang_es_test.go similarity index 100% rename from itn/es_test.go rename to itn/lang_es_test.go diff --git a/itn/lang_pt_test.go b/itn/lang_pt_test.go new file mode 100644 index 0000000..13061ac --- /dev/null +++ b/itn/lang_pt_test.go @@ -0,0 +1,177 @@ +package itn + +import ( + "testing" +) + +func TestAlpha2DigitPT(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ + { + input: "um vírgula um", + output: "1,1", + }, + { + input: "um vírgula quatrocentos e um", + output: "1,401", + }, + { + input: "vinte cinco vacas, doze galinhas e cento vinte e cinco kg de batatas.", + output: "25 vacas, 12 galinhas e 125 kg de batatas.", + }, + { + input: "mil duzentos sessenta e seis dólares.", + output: "1266 dólares.", + }, + { + input: "um dois três quatro vinte quinze", + output: "1 2 3 4 20 15", + }, + { + input: "vinte e um, trinta e um.", + output: "21, 31.", + }, + { + input: "mais trinta e três nove sessenta zero seis doze vinte e um", + output: "+33 9 60 06 12 21", + }, + { + input: "zero nove sessenta zero seis doze vinte e um", + output: "09 60 06 12 21", + }, + { + input: "cinquenta sessenta trinta onze", + output: "50 60 30 11", + }, + { + input: "duzentos e quarenta e quatro", + output: "244", + }, + { + input: "dois mil e vinte", + output: "2020", + }, + { + input: "mil novecentos e oitenta e quatro", + output: "1984", + }, + { + input: "mil e novecentos", + output: "1900", + }, + { + input: "dois mil cento e vinte cinco", + output: "2125", + }, + { + input: "Trezentos e setenta e oito milhões vinte e sete mil trezentos e doze", + output: "378027312", + }, + { + input: "treze mil zero noventa", + output: "13000 090", + }, + { + input: "zero", + output: "0", + }, + { + input: "doze vírgula noventa e nove, cento e vinte vírgula zero cinco, um vírgula duzentos e trinta e seis, um vírgula dois três seis.", + output: "12,99, 120,05, 1,236, 1,2 3 6.", + }, + { + input: "vírgula quinze", + output: "0,15", + }, + { + input: "Temos mais vinte graus dentro e menos quinze fora.", + output: "Temos +20 graus dentro e -15 fora.", + }, + { + input: "Um momento por favor! trinta e um gatos. Um dois três quatro!", + output: "Um momento por favor! 31 gatos. 1 2 3 4!", + }, + { + input: "Nem um. Um um. Trinta e um", + output: "Nem um. 1 1. 31", + }, + { + input: "Um milhao", + output: "1000000", + }, + { + input: "Um segundo por favor! Vigésimo segundo é diferente de vinte segundos.", + output: "Um segundo por favor! 22º é diferente de 20 segundos.", + }, + { + input: "Ordinais: primeiro, quinto, terceiro, vigésima, vigésimo primeiro, centésimo quadragésimo quinto", + output: "Ordinais: primeiro, 5º, terceiro, 20ª, 21º, 145º", + }, + { + input: "A décima quarta brigada do exército português, juntamento com o nonagésimo sexto regimento britânico, bateu o centésimo vigésimo sétimo regimento de infantaria de Napoleão", + output: "A 14ª brigada do exército português, juntamento com o 96º regimento britânico, bateu o 127º regimento de infantaria de Napoleão", + }, + { + input: "em mil quinhentos e catorze, ela nasceu", + output: "em 1514, ela nasceu", + }, + { + input: "tudo aconteceu até mil novecentos e dezesseis", + output: "tudo aconteceu até 1916", + }, + { + input: "em dezessete de janeiro de mil novecentos e noventa", + output: "em 17 de janeiro de 1990", + }, + { + input: "quanto é dezenove menos três? É dezesseis", + output: "quanto é 19 menos 3? É 16", + }, + } + + for _, tt := range tests { + processor, _ := NewLanguage(Portuguese) + new_string := processor.Alpha2Digit(tt.input, false, true, 3) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} + +func TestAlpha2DigitPTRelaxed(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ + { + input: "um dois três quatro trinta e cinco.", + output: "1 2 3 4 35.", + }, + { + input: "um dois três quatro vinte, cinco.", + output: "1 2 3 4 20, 5.", + }, + { + input: "trinta e quatro = trinta quatro", + output: "34 = 34", + }, + } + + for _, tt := range tests { + processor, _ := NewLanguage(Portuguese) + new_string := processor.Alpha2Digit(tt.input, true, true, 3) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} diff --git a/itn/parsers.go b/itn/parsers.go index 5a496cc..bf5768f 100644 --- a/itn/parsers.go +++ b/itn/parsers.go @@ -121,6 +121,7 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { if value == 0 { value = 1 } + logPrintf(">>> WordStreamValueParser.push.condition 3.3: [value] %d [coef] %d", value, coef) w.n000Val = value * coef } w.grpVal = 0 @@ -207,12 +208,12 @@ func (w *WordToDigitParser) close() { } func (w *WordToDigitParser) atStartOfSeq() bool { - print(">> WordToDigitParser.atStartOfSeq") + logPrintf(">> WordToDigitParser.atStartOfSeq") return w.InFrac && w.FracBuilder.GetValue() == 0 || !w.InFrac && w.IntBuilder.GetValue() == 0 } func (w *WordToDigitParser) atStart() bool { - print(">> WordToDigitParser.atStart") + logPrintf(">> WordToDigitParser.atStart") return !w.Open } diff --git a/itn/pt_ordinals.go b/itn/pt_ordinals.go new file mode 100644 index 0000000..ef19117 --- /dev/null +++ b/itn/pt_ordinals.go @@ -0,0 +1,167 @@ +package itn + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +var SegmentBreak = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*`) + +type SubRegex struct { + regex *regexp.Regexp + replacement string +} + +// Initialize the regexes and replacements +var subRegexes = []SubRegex{ + {regexp.MustCompile(`1\s`), "um "}, + {regexp.MustCompile(`2\s`), "dois"}, + {regexp.MustCompile(`\b1[º°]\b`), "primeiro"}, + {regexp.MustCompile(`\b2[º°]\b`), "segundo"}, + {regexp.MustCompile(`\b3[º°]\b`), "terceiro"}, + {regexp.MustCompile(`\b1ª\b`), "primeira"}, + {regexp.MustCompile(`\b2ª\b`), "segunda"}, + {regexp.MustCompile(`\b3ª\b`), "terceira"}, +} + +type OrdinalsMerger struct{} + +func NewOrdinalsMerger() *OrdinalsMerger { + return &OrdinalsMerger{} +} + +func (om *OrdinalsMerger) MergeCompoundOrdinalsPT(text string) string { + segments := SegmentBreak.Split(text, -1) + punct := SegmentBreak.FindAllString(text, -1) + if len(punct) < len(segments) { + punct = append(punct, "") + } + + segmentAndPuncts := []segmentAndPunct{} + for i, segment := range segments { + segmentAndPuncts = append(segmentAndPuncts, + segmentAndPunct{ + segment, + punct[i], + }, + ) + } + + outSegments := []string{} + for _, sp := range segmentAndPuncts { + tokens := []string{} + for _, t := range strings.Split(sp.segment, " ") { + if len(t) > 0 { + tokens = append(tokens, t) + } + } + + pointer := 0 + tokens2 := []string{} + currentIsOrdinal := false + seq := []int{} + gender := "" + ordinal := 0 + + for pointer < len(tokens) { + token := tokens[pointer] + if om.isOrdinal(token) { + currentIsOrdinal = true + seq = append(seq, om.getCardinal(token)) + gender = om.getGender(token) + } else { + if !currentIsOrdinal { + tokens2 = append(tokens2, token) + } else { + for _, s := range seq { + ordinal = ordinal + s + } + tokens2 = append(tokens2, fmt.Sprintf("%s%s", strconv.Itoa(ordinal), gender)) + tokens2 = append(tokens2, token) + seq = []int{} + currentIsOrdinal = false + } + } + + pointer++ + + } + + if currentIsOrdinal { + for _, s := range seq { + ordinal = ordinal + s + } + tokens2 = append(tokens2, fmt.Sprintf("%s%s", strconv.Itoa(ordinal), gender)) + } + + tokens2 = om.text2NumStyle(tokens2) + sp.segment = strings.Join(tokens2, " ") + sp.punct + outSegments = append(outSegments, sp.segment) + + } + + return strings.Join(outSegments, "") +} + +func (om *OrdinalsMerger) isOrdinal(token string) bool { + out := false + if len(token) > 1 && (strings.Contains(token, "º") || strings.Contains(token, "°") || strings.Contains(token, "ª")) { + out = true + } + if contains([]string{ + "primeiro", + "primeira", + "segundo", + "segunda", + "terceiro", + "terceira", + }, token) { + out = true + } + + return out +} + +func (om *OrdinalsMerger) getCardinal(token string) int { + out := 0 + token = strings.TrimSpace(token) + if len(token) < 2 { + return out + } + numPart := token[:len(token)-1] + out, err := strconv.Atoi(numPart) + if err != nil { + switch numPart { + case "primeir": + out = 1 + case "segund": + out = 2 + case "terceir": + out = 3 + } + } + return out +} + +func (om *OrdinalsMerger) getGender(token string) string { + gender := string(token[len(token)-1]) + if gender == "a" { + gender = "ª" + } + if gender == "o" { + gender = "º" + } + return gender +} + +func (om *OrdinalsMerger) text2NumStyle(tokens []string) []string { + for i, token := range tokens { + for _, r := range subRegexes { + token = r.regex.ReplaceAllString(token, r.replacement) + } + tokens[i] = token + } + return tokens +} From 4bb12e013b809629bfef4952a3f8711658953687 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Sat, 18 May 2024 17:03:02 -0500 Subject: [PATCH 2/2] working totally pt --- examples/alpha/pt/main.go | 2 + itn/base.go | 2 +- itn/i18n.go | 8 +--- itn/lang_pt_test.go | 22 +++++++++- itn/parsers.go | 9 +++- itn/pt_ordinals.go | 89 ++++++++++++++++++++++----------------- itn/utils.go | 8 ++++ 7 files changed, 90 insertions(+), 50 deletions(-) diff --git a/examples/alpha/pt/main.go b/examples/alpha/pt/main.go index 5fbfdeb..6b14052 100644 --- a/examples/alpha/pt/main.go +++ b/examples/alpha/pt/main.go @@ -10,4 +10,6 @@ func main() { processor, _ := itn.NewLanguage(itn.Portuguese) new_string := processor.Alpha2Digit("Trezentos e setenta e oito milhões vinte e sete mil trezentos e doze", false, true, 3) println(new_string) + println("-----------------------------------------------------") + println("378027312") } diff --git a/itn/base.go b/itn/base.go index 113403e..194d4b5 100644 --- a/itn/base.go +++ b/itn/base.go @@ -38,7 +38,7 @@ type RelaxTuple struct { func (lg *Language) Ord2Card(word string) string { switch lg.LangCode { case Portuguese: - logPrintf(">>>> Ord2Card.0 %s", word) + logPrintf(">>>> Ord2Card.0 [word] %s", word) if len(word) < 1 { return "" } diff --git a/itn/i18n.go b/itn/i18n.go index 85d4a18..3534add 100644 --- a/itn/i18n.go +++ b/itn/i18n.go @@ -378,13 +378,7 @@ func NewLanguage(LangCode LanguageCode) (*Language, error) { "um", "uma", }, - Relaxed: map[string]RelaxTuple{}, - RadMap: map[string]string{ - "fif": "five", - "eigh": "eight", - "nin": "nine", - "twelf": "twelve", - }, + Relaxed: map[string]RelaxTuple{}, Composites: map[string]int{}, PtOrdinals: map[string]string{ "primeir": "um", diff --git a/itn/lang_pt_test.go b/itn/lang_pt_test.go index 13061ac..81df6d9 100644 --- a/itn/lang_pt_test.go +++ b/itn/lang_pt_test.go @@ -127,6 +127,26 @@ func TestAlpha2DigitPT(t *testing.T) { input: "em dezessete de janeiro de mil novecentos e noventa", output: "em 17 de janeiro de 1990", }, + } + + for _, tt := range tests { + processor, _ := NewLanguage(Portuguese) + new_string := processor.Alpha2Digit(tt.input, false, true, 3) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} + +func TestAlpha2DigitPTFalse(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ { input: "quanto é dezenove menos três? É dezesseis", output: "quanto é 19 menos 3? É 16", @@ -135,7 +155,7 @@ func TestAlpha2DigitPT(t *testing.T) { for _, tt := range tests { processor, _ := NewLanguage(Portuguese) - new_string := processor.Alpha2Digit(tt.input, false, true, 3) + new_string := processor.Alpha2Digit(tt.input, true, false, 3) if new_string != tt.output { t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) } else { diff --git a/itn/parsers.go b/itn/parsers.go index bf5768f..2d732ec 100644 --- a/itn/parsers.go +++ b/itn/parsers.go @@ -57,17 +57,22 @@ func (w *WordStreamValueParser) groupExpects(word string, update bool) bool { func (w *WordStreamValueParser) isCoefAppliable(coef int) bool { logPrintf("+ WordStreamValueParser.isCoefAppliable.coef %d", coef) if w.lang.Simplify_check_coef_appliable { + logPrintf(">> WordStreamValueParser.isCoefAppliable.condition 0: [coef] %d", coef) return coef != w.GetValue() } if coef > w.GetValue() && (w.GetValue() > 0 || coef >= 100) { + logPrintf(">> WordStreamValueParser.isCoefAppliable.condition 1: [coef] %d", coef) return true } if coef*1000 <= w.n000Val || coef == 100 && 100 > w.grpVal { - return (w.grpVal > 0 || coef == 1000 || coef == 100) + logPrintf(">> WordStreamValueParser.isCoefAppliable.condition 2: [coef] %d", coef) + return w.grpVal > 0 || coef == 1000 || coef == 100 } + logPrintf(">> WordStreamValueParser.isCoefAppliable.condition 3: [coef] %d", coef) + return false } @@ -111,7 +116,7 @@ func (w *WordStreamValueParser) push(word string, lookAhead string) bool { return true } if coef < w.n000Val { - value := w.n000Val + value := w.grpVal if value == 0 { value = 1 } diff --git a/itn/pt_ordinals.go b/itn/pt_ordinals.go index ef19117..bdcd384 100644 --- a/itn/pt_ordinals.go +++ b/itn/pt_ordinals.go @@ -9,23 +9,6 @@ import ( var SegmentBreak = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*`) -type SubRegex struct { - regex *regexp.Regexp - replacement string -} - -// Initialize the regexes and replacements -var subRegexes = []SubRegex{ - {regexp.MustCompile(`1\s`), "um "}, - {regexp.MustCompile(`2\s`), "dois"}, - {regexp.MustCompile(`\b1[º°]\b`), "primeiro"}, - {regexp.MustCompile(`\b2[º°]\b`), "segundo"}, - {regexp.MustCompile(`\b3[º°]\b`), "terceiro"}, - {regexp.MustCompile(`\b1ª\b`), "primeira"}, - {regexp.MustCompile(`\b2ª\b`), "segunda"}, - {regexp.MustCompile(`\b3ª\b`), "terceira"}, -} - type OrdinalsMerger struct{} func NewOrdinalsMerger() *OrdinalsMerger { @@ -41,15 +24,13 @@ func (om *OrdinalsMerger) MergeCompoundOrdinalsPT(text string) string { segmentAndPuncts := []segmentAndPunct{} for i, segment := range segments { - segmentAndPuncts = append(segmentAndPuncts, - segmentAndPunct{ - segment, - punct[i], - }, - ) + segmentAndPuncts = append(segmentAndPuncts, segmentAndPunct{segment, punct[i]}) } outSegments := []string{} + ordinal := 0 + gender := "" + for _, sp := range segmentAndPuncts { tokens := []string{} for _, t := range strings.Split(sp.segment, " ") { @@ -62,26 +43,32 @@ func (om *OrdinalsMerger) MergeCompoundOrdinalsPT(text string) string { tokens2 := []string{} currentIsOrdinal := false seq := []int{} - gender := "" - ordinal := 0 + + logPrintf("> MergeCompoundOrdinalsPT.1.tokens %v [tokens2] %v", tokens, tokens2) for pointer < len(tokens) { token := tokens[pointer] + if om.isOrdinal(token) { + logPrintf("> MergeCompoundOrdinalsPT.2.1.token <%s> [tokens2] %v", token, tokens2) currentIsOrdinal = true seq = append(seq, om.getCardinal(token)) gender = om.getGender(token) } else { if !currentIsOrdinal { + logPrintf("> MergeCompoundOrdinalsPT.4.token %s [tokens2] %v", token, tokens2) tokens2 = append(tokens2, token) } else { - for _, s := range seq { - ordinal = ordinal + s - } + logPrintf("> MergeCompoundOrdinalsPT.5.token %s [tokens2] %v", token, tokens2) + logPrintf("> MergeCompoundOrdinalsPT.5.seq %v", seq) + ordinal = sumInts(seq) + logPrintf("> MergeCompoundOrdinalsPT.5.ordinal %d", ordinal) tokens2 = append(tokens2, fmt.Sprintf("%s%s", strconv.Itoa(ordinal), gender)) tokens2 = append(tokens2, token) seq = []int{} currentIsOrdinal = false + logPrintf("> MergeCompoundOrdinalsPT.5.1.token %s [tokens2] %v", token, tokens2) + } } @@ -90,9 +77,8 @@ func (om *OrdinalsMerger) MergeCompoundOrdinalsPT(text string) string { } if currentIsOrdinal { - for _, s := range seq { - ordinal = ordinal + s - } + logPrintf("> MergeCompoundOrdinalsPT.6.seq %v [tokens2] %v", seq, tokens2) + ordinal = sumInts(seq) tokens2 = append(tokens2, fmt.Sprintf("%s%s", strconv.Itoa(ordinal), gender)) } @@ -101,8 +87,10 @@ func (om *OrdinalsMerger) MergeCompoundOrdinalsPT(text string) string { outSegments = append(outSegments, sp.segment) } + text = strings.Join(outSegments, "") + logPrintf("> MergeCompoundOrdinalsPT.7.text %s", text) - return strings.Join(outSegments, "") + return text } func (om *OrdinalsMerger) isOrdinal(token string) bool { @@ -126,11 +114,9 @@ func (om *OrdinalsMerger) isOrdinal(token string) bool { func (om *OrdinalsMerger) getCardinal(token string) int { out := 0 - token = strings.TrimSpace(token) - if len(token) < 2 { - return out - } - numPart := token[:len(token)-1] + runes := []rune(token) + numPart := string(runes[:len(runes)-1]) // Extract the part of the string before the last character + logPrintf(">>>> [getCardinal] token[:-1] %s", numPart) out, err := strconv.Atoi(numPart) if err != nil { switch numPart { @@ -140,8 +126,11 @@ func (om *OrdinalsMerger) getCardinal(token string) int { out = 2 case "terceir": out = 3 + default: + out = 0 } } + logPrintf(">>>> [getCardinal] %s -> %d", token, out) return out } @@ -156,12 +145,34 @@ func (om *OrdinalsMerger) getGender(token string) string { return gender } +type SubRegex struct { + Pattern *regexp.Regexp + Replacement string +} + +var subRegexes = []SubRegex{ + {regexp.MustCompile(`1\s`), "um "}, + {regexp.MustCompile(`2\s`), "dois"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])1[º°]([^a-zA-Z0-9]|$)`), "primeiro"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])2[º°]([^a-zA-Z0-9]|$)`), "segundo"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])3[º°]([^a-zA-Z0-9]|$)`), "terceiro"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])1ª([^a-zA-Z0-9]|$)`), "primeira"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])2ª([^a-zA-Z0-9]|$)`), "segunda"}, + {regexp.MustCompile(`(^|[^a-zA-Z0-9])3ª([^a-zA-Z0-9]|$)`), "terceira"}, +} + func (om *OrdinalsMerger) text2NumStyle(tokens []string) []string { + logPrintf(">>>>>>> [Tokens] --- %v", tokens) for i, token := range tokens { - for _, r := range subRegexes { - token = r.regex.ReplaceAllString(token, r.replacement) + for _, sr := range subRegexes { + v := sr.Pattern.ReplaceAllString(token, sr.Replacement) + if token != v { + logPrintf(">>>>>>>>>>>>>>>>>>>>>>>>> [Tokens] !!![%d] %s -> %s", i, tokens[i], v) + } + token = v } tokens[i] = token } + logPrintf(">>>>>>> [Tokens] --- %v", tokens) return tokens } diff --git a/itn/utils.go b/itn/utils.go index 677f23b..8a08a59 100644 --- a/itn/utils.go +++ b/itn/utils.go @@ -27,3 +27,11 @@ func logPrintf(format string, args ...interface{}) { log.Printf(format, args...) } } + +func sumInts(ints []int) int { + sum := 0 + for _, i := range ints { + sum += i + } + return sum +}