Merge pull request #1 from pablodz/feat/dev

first release
pablodz · May 14, 2024 · 65c510f · 65c510f
2 parents 17c1828 + a7b0b97
commit 65c510f
Show file tree

Hide file tree

Showing 11 changed files with 784 additions and 6 deletions.
diff --git a/.deepsource.toml b/.deepsource.toml
@@ -1,10 +1,9 @@
 version = 1
 
+test_patterns = ["**/*_test.go"]
+
 [[analyzers]]
 name = "go"
 
   [analyzers.meta]
-  import_root = "github.com/pablodz/itn"
-
-[[transformers]]
-name = "gofumpt"
+  import_root = "github.com/sopro-dev/sopro-core" # NOTE(review): go.mod in this commit declares module github.com/pablodz/itn (the previous value here) — confirm this import_root is intended
diff --git a/.github/workflows/tagger.yml b/.github/workflows/tagger.yml
@@ -0,0 +1,31 @@
+name: tagger # Creates and pushes a git tag on every push to main.
+on:
+  push:
+    branches:
+      - main
+permissions:
+  contents: write # write access is what lets the job push the new tag
+
+jobs:
+  tagger:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Create tag # Builds NEW_TAG as "<contents of .version>-<branch>.<YYYYmmdd-HHMMSS>" and exports it through GITHUB_ENV; the branch is GITHUB_REF minus its first 11 characters, capped at 40 characters, with every non-alphanumeric replaced by "-".
+        id: tag
+        run: |
+          VERSION_FILE=".version"
+          VERSION_VALUE=$(cat $VERSION_FILE)
+          MAX_BRANCH_LENGTH=40
+          FIXED_BRANCH=$(echo ${GITHUB_REF:11:${MAX_BRANCH_LENGTH}} | sed 's/[^[:alnum:]]/-/g')
+          NEW_TAG=$(echo "$VERSION_VALUE-${FIXED_BRANCH}.$(date +%Y%m%d-%H%M%S)")
+          echo "NEW_TAG=$NEW_TAG" >> $GITHUB_ENV
+
+      - name: Push tag # NOTE(review): the user.email value below reads "[email protected]", which looks like an e-mail-obfuscation placeholder rather than a real address — confirm against the repository.
+        run: |
+          git config --local user.email "[email protected]"
+          git config --local user.name "GitHub Actions"
+          git tag ${{ env.NEW_TAG }}
+          git push origin ${{ env.NEW_TAG }}
diff --git a/.version b/.version
@@ -0,0 +1 @@
+v0.1.1
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Pablo
+Copyright (c) 2024 Pablo & text2num developers
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1 +1,18 @@
-# itn
+# Inverse Text Normalization
+
+## Installation
+
+```bash
+go get -v github.com/pablodz/itn@latest
+```
+
+## Examples
+
+See the [examples folder](/examples/) for runnable examples.
+
+## Supported languages
+
+- ✅ español
+- 🌀 francés
+- 🌀 italiano
+- 🌀 portugués
diff --git a/examples/alpha/main.go b/examples/alpha/main.go
@@ -0,0 +1,16 @@
+package main
+
+import (
+	"github.com/pablodz/itn/itn"
+)
+
+func main() { // Example: convert the number words in a Spanish phrase and print the result.
+	processor := itn.NewLanguageES()
+	new_string := processor.Alpha2Digit(
+		"uno dos quince", // text
+		false, // relaxed
+		true, // signed
+		3, // ordinalThreshold
+	)
+	println(new_string)
+}
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/pablodz/itn
+
+go 1.22.3
diff --git a/itn/base.go b/itn/base.go
@@ -0,0 +1,296 @@
+package itn
+
+import (
+	"fmt"
+	"log"
+	"regexp"
+	"strings"
+)
+
+type Language struct { // Language bundles the vocabulary tables used to recognise number words for one language.
+	Multipliers                   map[string]int // multiplier word -> value (NewLanguageES: "mil" = 1000, "millón" = 1000000, ...)
+	Units                         map[string]int // unit word -> value (1..9 in NewLanguageES)
+	STens                         map[string]int // single-word values; NewLanguageES fills 10..29 here
+	MTens                         map[string]int // tens words; NewLanguageES fills 30..90 here
+	MTensWSTens                   []string // left empty by NewLanguageES; purpose not visible in this file section
+	Hundred                       map[string]int // hundreds word -> value (100..900 in NewLanguageES)
+	MHundreds                     map[string]int // not populated by NewLanguageES
+	Numbers                       map[string]int // union of Multipliers, Units, STens, MTens and Hundred, as built by NewLanguageES
+	Sign                          map[string]string // sign word -> symbol ("+" or "-" in NewLanguageES)
+	Zero                          []string // words meaning zero
+	DecimalSep                    string // spoken decimal-separator word ("coma" in NewLanguageES)
+	DecimalSYM                    string // decimal symbol ("." in NewLanguageES)
+	AndNums                       []string // NewLanguageES lists the unit words here; presumably the words allowed after And — TODO confirm against the parser
+	And                           string // conjunction word ("y" in NewLanguageES)
+	NeverIfAlone                  []string // presumably words not converted when they stand alone — TODO confirm against the parser
+	Relaxed                       map[string]RelaxTuple // initialised empty by NewLanguageES
+	Simplify_check_coef_appliable bool // left at its zero value (false) by NewLanguageES; usage not visible here
+}
+
+func NewLanguageES() *Language { // NewLanguageES returns a Language populated with the Spanish vocabulary tables.
+	l := &Language{
+		Multipliers: map[string]int{
+			"mil":      1000,
+			"miles":    1000,
+			"millon":   1000000,
+			"millón":   1000000,
+			"millones": 1000000,
+		},
+		Units: map[string]int{
+			"uno":    1,
+			"dos":    2,
+			"tres":   3,
+			"cuatro": 4,
+			"cinco":  5,
+			"seis":   6,
+			"siete":  7,
+			"ocho":   8,
+			"nueve":  9,
+			"un":     1, // optional
+			"una":    1, // optional
+
+		},
+		STens: map[string]int{
+			"diez":         10,
+			"once":         11,
+			"doce":         12,
+			"trece":        13,
+			"catorce":      14,
+			"quince":       15,
+			"dieciseis":    16, // NOTE(review): the accented spelling "dieciséis" is not a key in this map
+			"diecisiete":   17,
+			"dieciocho":    18,
+			"diecinueve":   19,
+			"veinte":       20,
+			"veintiuno":    21,
+			"veintidos":    22,
+			"veintitres":   23,
+			"veinticuatro": 24,
+			"veinticinco":  25,
+			"veintiseis":   26, // NOTE(review): the accented spelling "veintiséis" is not a key in this map
+			"veintisiete":  27,
+			"veintiocho":   28,
+			"veintinueve":  29,
+			"veintitrés":   23, // with accent
+			"veintidós":    22, // with accent
+		},
+		MTens: map[string]int{
+			"treinta":   30,
+			"cuarenta":  40,
+			"cincuenta": 50,
+			"sesenta":   60,
+			"setenta":   70,
+			"ochenta":   80,
+			"noventa":   90,
+		},
+		MTensWSTens: []string{},
+		Hundred: map[string]int{
+			"cien":          100,
+			"ciento":        100,
+			"cienta":        100,
+			"doscientos":    200,
+			"trescientos":   300,
+			"cuatrocientos": 400,
+			"quinientos":    500,
+			"seiscientos":   600,
+			"setecientos":   700,
+			"ochocientos":   800,
+			"novecientos":   900,
+			"doscientas":    200, // with feminine
+			"trescientas":   300, // with feminine
+			"cuatrocientas": 400, // with feminine
+			"quinientas":    500, // with feminine
+			"seiscientas":   600, // with feminine
+			"setecientas":   700, // with feminine
+			"ochocientas":   800, // with feminine
+			"novecientas":   900, // with feminine
+		},
+		Sign: map[string]string{
+			"mas":   "+", // NOTE(review): only the unaccented spelling is listed; "más" is not a key
+			"menos": "-",
+		},
+		Zero: []string{
+			"cero",
+		},
+		DecimalSep: "coma",
+		DecimalSYM: ".",
+		AndNums: []string{
+			"un",
+			"uno",
+			"una",
+			"dos",
+			"tres",
+			"cuatro",
+			"cinco",
+			"seis",
+			"siete",
+			"ocho",
+			"nueve",
+		},
+
+		And: "y",
+		NeverIfAlone: []string{
+			"un",
+			"uno",
+			"una",
+		},
+		Relaxed: map[string]RelaxTuple{},
+	}
+
+	// Numbers starts from a hand-written copy of the l.Multipliers entries (keep the two lists in sync) and then absorbs every other table below.
+	l.Numbers = map[string]int{
+		"mil":      1000,
+		"miles":    1000,
+		"millon":   1000000,
+		"millón":   1000000,
+		"millones": 1000000,
+	}
+
+	for k, v := range l.Units {
+		l.Numbers[k] = v
+	}
+	for k, v := range l.STens {
+		l.Numbers[k] = v
+	}
+	for k, v := range l.MTens {
+		l.Numbers[k] = v
+	}
+	for k, v := range l.Hundred {
+		l.Numbers[k] = v
+	}
+
+	return l
+}
+
+type RelaxTuple struct { // RelaxTuple is the value type of Language.Relaxed: a pair of strings; how they are consumed is not visible in this file section.
+	Zero string
+	One  string
+}
+
+func (lg *Language) Ord2Card(word string) string { // Ord2Card is a stub: it ignores word and always returns "".
+	return ""
+}
+
+func (lg *Language) NumOrd(digits string, originalWord string) string { // NumOrd appends an ordinal indicator to digits, chosen from the last letter of originalWord.
+	if strings.HasSuffix(originalWord, "o") { // word ending in "o" -> "º"
+		return fmt.Sprintf("%sº", digits)
+	}
+	return fmt.Sprintf("%sª", digits) // any other ending -> "ª"
+}
+
+func (lg *Language) Normalize(word string) string { // Normalize currently returns word unchanged.
+	return word
+}
+
+func (lg *Language) NotNumericWord(word string) bool { // NotNumericWord reports whether word is empty, or is none of: DecimalSep, a key of Numbers, an entry of Zero.
+	return word == "" || word != lg.DecimalSep && !containsKey(lg.Numbers, word) && !contains(lg.Zero, word) // && binds tighter than ||: empty OR (not separator AND not number AND not zero)
+}
+
+var WORDSEP = regexp.MustCompile(`\s*[\.,;\(\)…\[\]:!\?]+\s*|\n`) // WORDSEP matches a run of . , ; ( ) … [ ] : ! ? with any surrounding whitespace, or a single newline; Alpha2Digit splits its input on it.
+
+type segmentAndPunct struct { // segmentAndPunct pairs a text segment with the separator that followed it (see Alpha2Digit).
+	segment string // text between two WORDSEP matches
+	punct   string // separator after the segment; "" for the final segment
+}
+
+type LookAhead struct { // LookAhead is a token together with the token that follows it.
+	Word  string // current token
+	Ahead string // next token, or "" when Word is the last one
+}
+
+func lookAhead(tokens []string) []LookAhead { // lookAhead pairs every token with the token that follows it ("" after the last one).
+	if len(tokens) == 0 {
+		return []LookAhead{} // empty input: empty, non-nil slice
+	}
+
+	lookAheads := []LookAhead{}
+	for i := 0; i < len(tokens); i++ {
+
+		nextWord := ""
+		if i+1 >= len(tokens) { // last token: nothing follows it
+			nextWord = ""
+		} else {
+			nextWord = tokens[i+1]
+		}
+
+		lookAheads = append(lookAheads, LookAhead{tokens[i], nextWord})
+	}
+	// The loop already leaves Ahead empty for the last element; this assignment only restates that.
+	lookAheads[len(lookAheads)-1].Ahead = ""
+
+	return lookAheads
+}
+
+func (lg Language) Alpha2Digit(text string, relaxed bool, signed bool, ordinalThreshold int) string { // Alpha2Digit splits text on WORDSEP, feeds each segment's words through a WordToDigitParser, and returns the text rebuilt with the parser's output in place of the words it consumed. relaxed, signed and ordinalThreshold are passed straight to the parser. Progress is logged through the standard logger.
+	segments := WORDSEP.Split(text, -1)
+	// for i, segment := range segments {
+	// 	log.Println("[segment]", i, segment)
+	// }
+	punct := WORDSEP.FindAllString(text, -1) // the separators themselves, in order
+	// for i, p := range punct {
+	// 	log.Println("[punct]", i, p)
+	// }
+
+	if len(punct) < len(segments) { // pad so the final segment also has a (blank) separator to pair with
+		punct = append(punct, "")
+	}
+
+	segmentAndPuncts := []segmentAndPunct{}
+	for i, segment := range segments {
+		segmentAndPuncts = append(segmentAndPuncts,
+			segmentAndPunct{
+				segment,
+				punct[i],
+			},
+		)
+	}
+
+	outSegments := []string{}
+	for _, sp := range segmentAndPuncts {
+		tokens := strings.Split(sp.segment, " ") // single-space split: consecutive spaces produce empty tokens
+		log.Printf("tokens %v", tokens)
+
+		numBuilder := NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, "") // last argument is "" here and lastWord on restart; presumably the preceding word — TODO confirm against the parser
+		lastWord := ""
+		inNumber := false
+		outTokens := []string{}
+		for _, couple := range lookAhead(tokens) {
+
+			log.Printf("✅ [word] %s [ahead] %s", couple.Word, couple.Ahead)
+
+			pushed := numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead)) // the parser sees lower-cased words
+			if pushed {
+				log.Printf("> condition 1: word %s ahead %s", couple.Word, couple.Ahead)
+				inNumber = true
+			} else if inNumber { // word rejected while a number was open: emit that number, then retry the word on a fresh parser
+				log.Printf("> condition 2: word %s ahead %s", couple.Word, couple.Ahead)
+				outTokens = append(outTokens, numBuilder.GetValue())
+				numBuilder = NewWordToDigitParser(lg, relaxed, signed, ordinalThreshold, lastWord)
+				inNumber = numBuilder.push(strings.ToLower(couple.Word), strings.ToLower(couple.Ahead))
+			}
+
+			if !inNumber { // word not consumed by any parser: copy it through with its original casing
+				log.Printf("> condition 3: word %s ahead %s", couple.Word, couple.Ahead)
+				outTokens = append(outTokens, couple.Word)
+			}
+
+			lastWord = strings.ToLower(couple.Word)
+
+			log.Printf("... lastWord %s, inNumber %t, outTokens %v", lastWord, inNumber, outTokens)
+
+		}
+
+		log.Printf("---")
+		numBuilder.close() // end of segment: close the parser before reading its value
+		if numBuilder.GetValue() != "" {
+			outTokens = append(outTokens, numBuilder.GetValue())
+		}
+
+		outSegments = append(outSegments, strings.Join(outTokens, " "))
+		outSegments = append(outSegments, sp.punct) // re-attach the separator exactly as matched
+
+	}
+	text = strings.Join(outSegments, "")
+
+	return text
+}