Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first release #1

Merged
merged 7 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .deepsource.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
version = 1

test_patterns = ["**/*_test.go"]

[[analyzers]]
name = "go"

[analyzers.meta]
import_root = "github.com/pablodz/itn"

[[transformers]]
name = "gofumpt"
import_root = "github.com/sopro-dev/sopro-core"
31 changes: 31 additions & 0 deletions .github/workflows/tagger.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Workflow that creates and pushes a git tag for every push to the main branch.
name: tagger
on:
push:
branches:
- main
# Write access to repository contents; the final step runs `git push` for the new tag.
permissions:
contents: write

jobs:
tagger:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3

# Builds NEW_TAG as "<contents of .version>-<sanitized branch>.<YYYYmmdd-HHMMSS>"
# and exports it to later steps by appending to the GITHUB_ENV file.
# The branch part is at most 40 characters of GITHUB_REF starting at offset 11,
# with every non-alphanumeric character replaced by "-".
- name: Create tag
id: tag
run: |
VERSION_FILE=".version"
VERSION_VALUE=$(cat $VERSION_FILE)
MAX_BRANCH_LENGTH=40
FIXED_BRANCH=$(echo ${GITHUB_REF:11:${MAX_BRANCH_LENGTH}} | sed 's/[^[:alnum:]]/-/g')
NEW_TAG=$(echo "$VERSION_VALUE-${FIXED_BRANCH}.$(date +%Y%m%d-%H%M%S)")
echo "NEW_TAG=$NEW_TAG" >> $GITHUB_ENV

# Configures a local git identity, creates the tag computed above and pushes it to origin.
- name: Push tag
run: |
git config --local user.email "[email protected]"
git config --local user.name "GitHub Actions"
git tag ${{ env.NEW_TAG }}
git push origin ${{ env.NEW_TAG }}
1 change: 1 addition & 0 deletions .version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
v0.1.1
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 Pablo
Copyright (c) 2024 Pablo & text2num developers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,18 @@
# itn
# Inverse Text Normalization

## Installation

```bash
go get -v github.com/pablodz/itn@latest
```

## Examples

See the [examples folder](/examples/) for runnable usage samples.

## Supported languages

- ✅ español
- 🌀 francés
- 🌀 italiano
- 🌀 portugués
16 changes: 16 additions & 0 deletions examples/alpha/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package main

import (
"github.com/pablodz/itn/itn"
)

// main converts a short Spanish phrase with the Spanish language definition
// and prints the result using the println builtin.
func main() {
	es := itn.NewLanguageES()

	// Arguments after the text are, in order: relaxed=false, signed=true,
	// ordinalThreshold=3.
	converted := es.Alpha2Digit("uno dos quince", false, true, 3)

	println(converted)
}
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/pablodz/itn

go 1.22.3
296 changes: 296 additions & 0 deletions itn/base.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
package itn

import (
"fmt"
"log"
"regexp"
"strings"
)

// Language groups the word tables and settings that describe how numbers are
// spelled in one natural language. NewLanguageES shows a fully populated
// example for Spanish.
type Language struct {
	// Multipliers maps scale words to their value (in Spanish: "mil",
	// "millón", ... -> 1000, 1000000).
	Multipliers map[string]int
	// Units maps the words for 1 through 9 to their value.
	Units map[string]int
	// STens maps single-word numerals to their value; the Spanish table
	// covers 10 through 29.
	STens map[string]int
	// MTens maps the words for multiples of ten to their value; the Spanish
	// table covers 30 through 90.
	MTens map[string]int
	// MTensWSTens is left empty by NewLanguageES.
	// NOTE(review): its role is not visible in this file — confirm against the parser.
	MTensWSTens []string
	// Hundred maps the words for hundreds to their value; the Spanish table
	// covers 100 through 900, including feminine forms.
	Hundred map[string]int
	// MHundreds is not populated by NewLanguageES.
	// NOTE(review): purpose not visible in this file — confirm against the parser.
	MHundreds map[string]int
	// Numbers is the union of Multipliers, Units, STens, MTens and Hundred,
	// as built by NewLanguageES. NotNumericWord consults it.
	Numbers map[string]int
	// Sign maps a sign word to the symbol it stands for (e.g. "menos" -> "-").
	Sign map[string]string
	// Zero lists the words that mean zero. NotNumericWord consults it.
	Zero []string
	// DecimalSep is the spoken word for the decimal separator ("coma" in
	// Spanish). NotNumericWord treats it as a numeric word.
	DecimalSep string
	// DecimalSYM is the decimal symbol ("." in Spanish); presumably the
	// character written in the output — confirm against the parser.
	DecimalSYM string
	// AndNums lists unit words; presumably those allowed after And — TODO confirm.
	AndNums []string
	// And is the conjunction word ("y" in Spanish).
	And string
	// NeverIfAlone lists words ("un", "uno", "una" in Spanish); presumably
	// they are not converted when they appear on their own — TODO confirm.
	NeverIfAlone []string
	// Relaxed is left empty by NewLanguageES.
	// NOTE(review): presumably feeds the "relaxed" option of Alpha2Digit — confirm.
	Relaxed map[string]RelaxTuple
	// Simplify_check_coef_appliable is not set by NewLanguageES, so it keeps
	// its zero value (false) there.
	Simplify_check_coef_appliable bool
}

// NewLanguageES returns the Spanish language definition.
//
// The returned value has every word table filled in, and Numbers is built as
// the union of Multipliers, Units, STens, MTens and Hundred so that it can
// never drift from the tables it is derived from.
//
// Both the accented and the unaccented spellings are accepted wherever the
// standard spelling carries an accent ("dieciséis", "veintidós",
// "veintitrés", "veintiséis", "millón", "más").
func NewLanguageES() *Language {
	l := &Language{
		Multipliers: map[string]int{
			"mil":      1000,
			"miles":    1000,
			"millon":   1000000,
			"millón":   1000000,
			"millones": 1000000,
		},
		Units: map[string]int{
			"uno":    1,
			"dos":    2,
			"tres":   3,
			"cuatro": 4,
			"cinco":  5,
			"seis":   6,
			"siete":  7,
			"ocho":   8,
			"nueve":  9,
			"un":     1, // optional
			"una":    1, // optional
		},
		STens: map[string]int{
			"diez":         10,
			"once":         11,
			"doce":         12,
			"trece":        13,
			"catorce":      14,
			"quince":       15,
			"dieciseis":    16,
			"diecisiete":   17,
			"dieciocho":    18,
			"diecinueve":   19,
			"veinte":       20,
			"veintiuno":    21,
			"veintidos":    22,
			"veintitres":   23,
			"veinticuatro": 24,
			"veinticinco":  25,
			"veintiseis":   26,
			"veintisiete":  27,
			"veintiocho":   28,
			"veintinueve":  29,
			"dieciséis":    16, // with accent
			"veintidós":    22, // with accent
			"veintitrés":   23, // with accent
			"veintiséis":   26, // with accent
		},
		MTens: map[string]int{
			"treinta":   30,
			"cuarenta":  40,
			"cincuenta": 50,
			"sesenta":   60,
			"setenta":   70,
			"ochenta":   80,
			"noventa":   90,
		},
		MTensWSTens: []string{},
		Hundred: map[string]int{
			"cien":          100,
			"ciento":        100,
			"cienta":        100,
			"doscientos":    200,
			"trescientos":   300,
			"cuatrocientos": 400,
			"quinientos":    500,
			"seiscientos":   600,
			"setecientos":   700,
			"ochocientos":   800,
			"novecientos":   900,
			"doscientas":    200, // with feminine
			"trescientas":   300, // with feminine
			"cuatrocientas": 400, // with feminine
			"quinientas":    500, // with feminine
			"seiscientas":   600, // with feminine
			"setecientas":   700, // with feminine
			"ochocientas":   800, // with feminine
			"novecientas":   900, // with feminine
		},
		Sign: map[string]string{
			"mas":   "+",
			"más":   "+", // with accent
			"menos": "-",
		},
		Zero: []string{
			"cero",
		},
		DecimalSep: "coma",
		DecimalSYM: ".",
		AndNums: []string{
			"un",
			"uno",
			"una",
			"dos",
			"tres",
			"cuatro",
			"cinco",
			"seis",
			"siete",
			"ocho",
			"nueve",
		},

		And: "y",
		NeverIfAlone: []string{
			"un",
			"uno",
			"una",
		},
		Relaxed: map[string]RelaxTuple{},
	}

	// Numbers is derived from the tables above instead of being spelled out a
	// second time, so a word added to any table is picked up automatically.
	groups := []map[string]int{l.Multipliers, l.Units, l.STens, l.MTens, l.Hundred}
	total := 0
	for _, group := range groups {
		total += len(group)
	}
	l.Numbers = make(map[string]int, total)
	for _, group := range groups {
		for word, value := range group {
			l.Numbers[word] = value
		}
	}

	return l
}

// RelaxTuple is the value type of Language.Relaxed: a pair of strings.
// NOTE(review): the meaning of the two members is not visible in this file;
// the names suggest positional elements 0 and 1 of a tuple — confirm against
// the code that reads Language.Relaxed.
type RelaxTuple struct {
	// Zero is the first member of the pair.
	Zero string
	// One is the second member of the pair.
	One string
}

// Ord2Card is currently a stub: it ignores word and always returns the empty
// string.
// NOTE(review): the name suggests an ordinal-to-cardinal conversion, which is
// not implemented here.
func (lg *Language) Ord2Card(word string) string {
	return ""
}

// NumOrd renders digits as an ordinal by appending an ordinal indicator.
//
// The indicator is "º" when originalWord ends in "o" and "ª" for every other
// originalWord, including the empty string.
func (lg *Language) NumOrd(digits string, originalWord string) string {
	layout := "%sª"
	if strings.HasSuffix(originalWord, "o") {
		layout = "%sº"
	}
	return fmt.Sprintf(layout, digits)
}

// Normalize returns word unchanged; no normalization is applied for this
// language definition.
func (lg *Language) Normalize(word string) bool_placeholder_removed

// NotNumericWord reports whether word is not part of a spoken number.
//
// The empty string is never numeric. Any other word counts as numeric when it
// equals lg.DecimalSep, or when the containsKey / contains helpers (defined
// elsewhere in the package) report it for lg.Numbers or lg.Zero respectively.
func (lg *Language) NotNumericWord(word string) bool {
	switch {
	case word == "":
		return true
	case word == lg.DecimalSep:
		return false
	case containsKey(lg.Numbers, word):
		return false
	default:
		return !contains(lg.Zero, word)
	}
}

// WORDSEP matches a segment separator: either a run of one or more of the
// characters . , ; ( ) … [ ] : ! ? together with any whitespace around it, or
// a single newline. Alpha2Digit uses it both to split the text into segments
// and to recover the separators between them.
var WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`)

// segmentAndPunct pairs one text segment produced by splitting on WORDSEP
// with the separator that follows it. Alpha2Digit uses "" as the separator of
// the final segment.
type segmentAndPunct struct {
	// segment is the text between two separators.
	segment string
	// punct is the separator text that follows segment.
	punct string
}

// LookAhead pairs a token with the token that immediately follows it.
type LookAhead struct {
	// Word is the current token.
	Word string
	// Ahead is the next token, or "" when Word is the last one.
	Ahead string
}

// lookAhead returns one LookAhead per token, in order, each carrying the
// token that follows it. The last entry has an empty Ahead. An empty or nil
// input yields an empty, non-nil slice.
func lookAhead(tokens []string) []LookAhead {
	pairs := make([]LookAhead, len(tokens))
	for i, tok := range tokens {
		pairs[i].Word = tok
		// Ahead keeps its zero value "" for the final token.
		if next := i + 1; next < len(tokens) {
			pairs[i].Ahead = tokens[next]
		}
	}
	return pairs
}

// Alpha2Digit rewrites the spelled-out numbers found in text as digits and
// returns the resulting text.
//
// The text is split into segments on WORDSEP; each segment is split on single
// spaces and fed word by word (lower-cased, together with the following word)
// to a parser created by NewWordToDigitParser. Words the parser accepts are
// replaced by the parser's value; all other words are copied through with
// their original casing. Segments are re-joined with the separators that
// WORDSEP matched.
//
// relaxed, signed and ordinalThreshold are passed to NewWordToDigitParser
// unchanged. Progress is traced through the standard logger.
func (lg Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string {
	pieces := WORDSEP.Split(text, -1)
	seps := WORDSEP.FindAllString(text, -1)

	// Splitting yields one more piece than separators; pad so that every
	// piece has a (possibly empty) trailing separator.
	if len(seps) < len(pieces) {
		seps = append(seps, "")
	}

	rendered := []string{}
	for idx, piece := range pieces {
		words := strings.Split(piece, " ")
		log.Printf("tokens %v", words)

		parser := NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, "")
		previous := ""
		numberOpen := false
		emitted := []string{}

		for _, pair := range lookAhead(words) {
			log.Printf("✅ [word] %s [ahead] %s", pair.Word, pair.Ahead)

			lowerWord := strings.ToLower(pair.Word)
			lowerAhead := strings.ToLower(pair.Ahead)

			switch {
			case parser.push(lowerWord, lowerAhead):
				// The word extends the number being built.
				log.Printf("> condition 1: word %s ahead %s", pair.Word, pair.Ahead)
				numberOpen = true
			case numberOpen:
				// The current number ended: flush it and offer the same word
				// to a fresh parser that knows the preceding word.
				log.Printf("> condition 2: word %s ahead %s", pair.Word, pair.Ahead)
				emitted = append(emitted, parser.GetValue())
				parser = NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, previous)
				numberOpen = parser.push(lowerWord, lowerAhead)
			}

			if !numberOpen {
				// Not part of any number: keep the word as written.
				log.Printf("> condition 3: word %s ahead %s", pair.Word, pair.Ahead)
				emitted = append(emitted, pair.Word)
			}

			previous = lowerWord

			log.Printf("... lastWord %s, inNumber %t, outTokens %v", previous, numberOpen, emitted)
		}

		log.Printf("---")
		// Close the parser at the end of the segment and flush what is left.
		parser.close()
		if parser.GetValue() != "" {
			emitted = append(emitted, parser.GetValue())
		}

		rendered = append(rendered, strings.Join(emitted, " "), seps[idx])
	}

	return strings.Join(rendered, "")
}
Loading