Marking typo support (#18)

* Adding (pre-)processors; this introduces support for marking typos (opt-in, not used by default) * Go mod update (from 1.17 to 1.19)
Dynom · Feb 23, 2023 · 7a70d0b · 7a70d0b
1 parent 1955485
commit 7a70d0b
Show file tree

Hide file tree

Showing 20 changed files with 288 additions and 90 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -4,12 +4,12 @@ version: 2
 jobs:
   build-and-test:
     docker:
-      - image: cimg/go:1.17
+      - image: cimg/go:1.18
 
     environment:
       BINARY_NAME: "TySug-linux-amd64"
       TEST_RESULTS: "/tmp/test-results"
-      #GOFLAGS: "-buildvcs=false" # for 1.18
+      GOFLAGS: "-buildvcs=false -trimpath"
 
 
     steps:
@@ -23,7 +23,7 @@ jobs:
           name: Build
           command: |
             TAG=${CIRCLE_TAG:-dev}
-            CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o "${TEST_RESULTS}/${BINARY_NAME}" -a -ldflags="-w -s -X main.Version=${TAG}" ./cmd/web
+            GOFLAGS="-buildvcs=false -trimpath" CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o "${TEST_RESULTS}/${BINARY_NAME}" -a -ldflags="-w -s -X main.Version=${TAG}" ./cmd/web
 
       - run:
           # Check if we have updates to minor/patch level packages we're explicitly referencing
@@ -34,7 +34,7 @@ jobs:
       - run:
           name: Lint
           command: |
-            curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.44.2
+            curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.47.0
             golangci-lint run
 
       - run:

diff --git a/.golangci.toml b/.golangci.toml
@@ -7,7 +7,7 @@
         min-complexity = 20
 
     [linters-settings.gofumpt]
-        lang-version = "1.17"
+        lang-version = "1.19"
         extra-rules = true
 
     [linters-settings.goconst]
@@ -27,9 +27,7 @@
         "govet",
         "errcheck",
         "unused",
-        "structcheck",
-        "varcheck",
-        "deadcode",
+        "exhaustive",
 
         "stylecheck",
         "gosec",

diff --git a/README.md b/README.md
@@ -212,9 +212,6 @@ Dealing with typos is complicated and heavily context dependent.
 - Using the Web for Language Independent Spellchecking and Autocorrection - [http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf](http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf)
 - Spellchecking by computer - [https://www.dcs.bbk.ac.uk/..roger/spellchecking.html](https://www.dcs.bbk.ac.uk/~roger/spellchecking.html)
 
-# Wishlist
-- Support for [Marking Typos](https://en.wikipedia.org/wiki/Typographical_error#Marking_typos). -- Probably not particularly useful, but seems fun to implement.
-
 # Contributing
 
 First of all: Awesome!

diff --git a/buildDocker.sh b/buildDocker.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
-LATEST_TAG="$(git describe --tags $(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904))"
-TAG_REF="$(git show-ref --hash --tags ${LATEST_TAG})"
+LATEST_TAG="$(git describe --tags "$(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904)")"
+TAG_REF="$(git show-ref --hash --tags "${LATEST_TAG}")"
 
-docker build -t dynom/tysug:${LATEST_TAG} \
+docker build -t "dynom/tysug:${LATEST_TAG}" \
 	--build-arg VERSION="${LATEST_TAG}" \
 	--build-arg GIT_REF="${TAG_REF}" \
 	. &&
-docker tag dynom/tysug:${LATEST_TAG} dynom/tysug:latest
+docker tag "dynom/tysug:${LATEST_TAG}" dynom/tysug:latest
diff --git a/cmd/web/main.go b/cmd/web/main.go
@@ -2,7 +2,6 @@ package main
 
 import (
 	"fmt"
-	"io/ioutil"
 	"net/http"
 	"os"
 
@@ -87,7 +86,7 @@ func main() {
 func buildConfig(fileName string) (Config, error) {
 	c := Config{}
 
-	b, err := ioutil.ReadFile(fileName)
+	b, err := os.ReadFile(fileName)
 	if err != nil {
 		return c, fmt.Errorf("unable to open %q, reason: %s", fileName, err)
 	}

diff --git a/createRelease.sh b/createRelease.sh
@@ -6,29 +6,29 @@ set -o pipefail -o nounset -o errexit -o errtrace
 ROOT_DIR="$(pwd)"
 while [ ! -d "${ROOT_DIR}/.git" ]; do
 
-    ROOT_DIR="$(dirname ${ROOT_DIR})"
-    if [ "x${ROOT_DIR}" == "x/" ]; then
+    ROOT_DIR="$(dirname "${ROOT_DIR}")"
+    if [[ "x${ROOT_DIR}" == "x/" ]]; then
         echo "Cannot find .git directory, I use that as reference for the commands."
         exit 1
     fi
 done
 
-# Determine our projectname
-NAME="$(basename $(pwd))"
+# Determine our project name
+NAME="$(basename "$(pwd)")"
 
 # Checking if we have any tags to start with, the cid is Git's magical initial repo hash
 TAGS=$(git rev-list --tags --count 4b825dc642cb6eb9a060e54bf8d69288fbee4904)
-if [ ${TAGS} -eq 0 ];
+if [[ "${TAGS}" -eq 0 ]];
 then
 	echo "No tags detected for ${ROOT_DIR}, please create a tag first!"
 	exit 1;
 fi
 
 # Figuring out which tag we're on
-LATEST_TAG=$(git describe --tags $(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904))
+LATEST_TAG=$(git describe --tags "$(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904)")
 PREV_TAG=$(git tag --sort version:refname | tail -2 | head -1 || true)
 
-if [ "x${LATEST_TAG}" == "x" -a "x${PREV_TAG}" == "x" ];
+if [[ "x${LATEST_TAG}" == "x" && "x${PREV_TAG}" == "x" ]];
 then
     echo "No tag has been found?"
     exit 1
@@ -37,7 +37,7 @@ echo "Previous tag is: ${PREV_TAG}"
 echo "Building a release for tag: ${LATEST_TAG}"
 
 # Falling back to the first commit, if we only have one tag
-if [ "x${PREV_TAG}" == "x${LATEST_TAG}" ];
+if [[ "x${PREV_TAG}" == "x${LATEST_TAG}" ]];
 then
     PREV_TAG=$(git rev-list --max-parents=0 HEAD)
 fi
@@ -59,22 +59,32 @@ gox -ldflags "-s -w -X main.Version=${LATEST_TAG}" \
 	./cmd/web
 
 # Archive
-HERE=$(pwd)
-BUILDDIR=${HERE}/build
-for DIR in $(ls build/);
+HERE="$(pwd)"
+BUILD_DIR="${HERE}/build"
+for DIR in "${BUILD_DIR}"/*;
 do
-    OUTDIR="${HERE}/dist"
-    OUTFILENAME="${DIR}.tar.gz"
-    OUTFILE="${OUTDIR}/${OUTFILENAME}"
-    cd ${BUILDDIR}/${DIR} && \
-        tar -czf ${OUTFILE} * && \
-    cd ${OUTDIR} && \
-        shasum -a 512 ${OUTFILENAME} > ${OUTFILE}.sha512
+    BASE="$(basename "${DIR}")"
+    OUT_DIR="${HERE}/dist"
+    OUT_FILE_NAME="${BASE}.tar.gz"
+    OUT_FILE="${OUT_DIR}/${OUT_FILE_NAME}"
+    cd "${DIR}" && \
+        tar -czf "${OUT_FILE}" ./* && \
+    cd "${OUT_DIR}" && \
+        shasum -a 512 "${OUT_FILE_NAME}" > "${OUT_FILE}".sha512
 done
-cd ${HERE}
+cd "${HERE}"
 
 # Building the changelog
 DIFF_REF="${PREV_TAG}..${LATEST_TAG}"
-CHANGELOG="$(printf '# %s\n%s' 'Changelog' "$(git log ${DIFF_REF} --oneline --no-merges --reverse)")"
+CHANGELOG="$(printf '# %s\n%s' 'Changelog' "$(git log "${DIFF_REF}" --oneline --no-merges --reverse)")"
+
 echo "Building the changelog based on these two ref's: '${DIFF_REF}'"
-github-release Dynom/${NAME} ${LATEST_TAG} "$(git rev-parse --abbrev-ref HEAD)" "${CHANGELOG}" 'dist/*';
+# Publish the release. Owner and repository fall back to defaults when the
+# GITHUB_* variables are unset; the ":-" form is required because `nounset` is on.
+ghr -owner "${GITHUB_USERNAME:-Dynom}" \
+    -repository "${GITHUB_REPOSITORY:-${NAME}}" \
+    -commitish "$(git rev-parse HEAD)" \
+    -delete \
+    -body "${CHANGELOG}" \
+    "${LATEST_TAG}" \
+    ./dist/
+
+
diff --git a/finder/algorithm.go b/finder/algorithm.go
@@ -61,6 +61,7 @@ func NewWagnerFischer(insert, delete, substitution int) Algorithm {
 // - Reduced allocations
 // - Added rounding on the unaligned matches as per: http://www.alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 // - Added support for 1 character inputs, by making sure the match distances is never negative
+//
 //nolint:gocognit
 func NewJaro() Algorithm {
 	return func(a, b string) float64 {

diff --git a/finder/find.go b/finder/find.go
@@ -11,16 +11,16 @@ import (
 
 // Finder is the type to find the nearest reference
 type Finder struct {
-	referenceMap    referenceMapType
-	reference       []string
-	referenceBucket referenceBucketType
-	Alg             Algorithm
-	LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
-	lock            *rwc.RWCMutex
-	bucketChars     uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
+	referenceMap       referenceMapType
+	reference          []string
+	referenceBucket    referenceBucketType
+	algorithm          Algorithm
+	inputPreProcessors []Processor
+	lengthTolerance    float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
+	lock               *rwc.RWCMutex
+	bucketChars        uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
 }
 
-// Errors
 var (
 	ErrNoAlgorithmDefined    = errors.New("no algorithm defined")
 	ErrPrefixExceedsInputLen = errors.New("prefix length exceeds input length")
@@ -51,7 +51,7 @@ func New(list []string, options ...Option) (*Finder, error) {
 
 	i.Refresh(list)
 
-	if i.Alg == nil {
+	if i.algorithm == nil {
 		return i, ErrNoAlgorithmDefined
 	}
 
@@ -173,6 +173,12 @@ func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLeng
 		return []string{input}, WorstScoreValue, false, ErrPrefixExceedsInputLen
 	}
 
+	if len(t.inputPreProcessors) > 0 {
+		for _, p := range t.inputPreProcessors {
+			input = p(input)
+		}
+	}
+
 	t.lock.RLock()
 	defer t.lock.RUnlock()
 
@@ -198,11 +204,11 @@ func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLeng
 		}
 
 		// Test if the input length differs too much from the reference, making it an unlikely typo.
-		if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
+		if !meetsLengthTolerance(t.lengthTolerance, input, ref) {
 			continue
 		}
 
-		score := t.Alg(input, ref)
+		score := t.algorithm(input, ref)
 		if score > hs {
 			hs = score
 			sameScore = sameScore[0:1]

diff --git a/finder/find_benchmarks_test.go b/finder/find_benchmarks_test.go
@@ -107,7 +107,7 @@ func BenchmarkFindWithBucket(b *testing.B) {
 		f, _ := New(refs,
 			WithAlgorithm(alg),
 			WithLengthTolerance(0),
-			WithPrefixBuckets(false),
+			WithPrefixBuckets(true),
 		)
 
 		b.ResetTimer()
@@ -120,7 +120,7 @@ func BenchmarkFindWithBucket(b *testing.B) {
 		f, _ := New(refs,
 			WithAlgorithm(alg),
 			WithLengthTolerance(0),
-			WithPrefixBuckets(true),
+			WithPrefixBuckets(false),
 		)
 
 		b.ResetTimer()
@@ -166,11 +166,11 @@ func BenchmarkCopyOrAppend(b *testing.B) {
 		}
 	})
 
-	// "dst smaller copy" can't work, since the result won't contain all items or requires logic which'll make the
-	// implementation slower than an append
+	// "dst smaller copy" can't work, since the result won't contain all items or requires logic which makes the
+	// implementation slower than append
 
 	b.Run("dst smaller append", func(b *testing.B) {
-		refsAppendDst = make([]string, int(numToAllocate/2))
+		refsAppendDst = make([]string, numToAllocate/2)
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			refsAppendDst = append(refsAppendSrc[:0:0], refsAppendSrc...)
@@ -198,7 +198,7 @@ func BenchmarkCopyOrAppend(b *testing.B) {
 	})
 
 	b.Run("dst larger append", func(b *testing.B) {
-		refsAppendDst = make([]string, int(numToAllocate*2))
+		refsAppendDst = make([]string, numToAllocate*2)
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			refsAppendDst = append(refsAppendSrc[:0:0], refsAppendSrc...)

diff --git a/finder/find_test.go b/finder/find_test.go
@@ -345,7 +345,7 @@ func TestNoInput(t *testing.T) {
 
 func TestContextCancel(t *testing.T) {
 	sug, err := New([]string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"}, func(sug *Finder) {
-		sug.Alg = func(a, b string) float64 {
+		sug.algorithm = func(a, b string) float64 {
 			time.Sleep(10 * time.Millisecond)
 			return 1
 		}
@@ -453,7 +453,7 @@ func TestFinder_FindTopRankingPrefixCtx(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t1 *testing.T) {
 			finder, _ := New(refs, func(sug *Finder) {
-				sug.Alg = func(a, b string) float64 {
+				sug.algorithm = func(a, b string) float64 {
 					return 1
 				}
 			})
@@ -750,15 +750,13 @@ func Test_meetsPrefixLengthMatch(t *testing.T) {
 }
 
 func BenchmarkFindTopRankingCTXRace(b *testing.B) {
-
 	sort.Strings(inspirationalRefList)
 	f, err := New(
 		inspirationalRefList[0:5],
 		WithAlgorithm(exampleAlgorithm),
 		WithLengthTolerance(0),
 		WithPrefixBuckets(false),
 	)
-
 	if err != nil {
 		b.Fatal("Setting up test failed")
 	}
@@ -781,15 +779,13 @@ func BenchmarkFindTopRankingCTXRace(b *testing.B) {
 }
 
 func BenchmarkFindTopRankingCTX(b *testing.B) {
-
 	sort.Strings(inspirationalRefList)
 	f, err := New(
 		inspirationalRefList[0:5],
 		WithAlgorithm(exampleAlgorithm),
 		WithLengthTolerance(0),
 		WithPrefixBuckets(false),
 	)
-
 	if err != nil {
 		b.Fatal("Setting up test failed")
 	}

diff --git a/finder/option.go b/finder/option.go
@@ -6,7 +6,7 @@ type Option func(sug *Finder)
 // WithAlgorithm allows you to set any algorithm
 func WithAlgorithm(alg Algorithm) Option {
 	return func(s *Finder) {
-		s.Alg = alg
+		s.algorithm = alg
 	}
 }
 
@@ -15,7 +15,7 @@ func WithAlgorithm(alg Algorithm) Option {
 // size, with a minimum of 1 character. A value of 0 (the default) disables this feature.
 func WithLengthTolerance(t float64) Option {
 	return func(s *Finder) {
-		s.LengthTolerance = t
+		s.lengthTolerance = t
 	}
 }
 
@@ -28,3 +28,9 @@ func WithPrefixBuckets(enable bool) Option {
 		}
 	}
 }
+
+// WithPreProcessor sets the processors that are applied to the input, in the
+// order given, before it is compared against the references. The supplied list
+// replaces any previously configured pre-processors.
+func WithPreProcessor(p ...Processor) Option {
+	return func(sug *Finder) {
+		sug.inputPreProcessors = p
+	}
+}
diff --git a/finder/option_test.go b/finder/option_test.go
@@ -9,7 +9,7 @@ func TestSetAlgorithm(t *testing.T) {
 
 	sug, err := New([]string{}, WithAlgorithm(veryPositiveAlg))
 
-	if sug.Alg == nil || err == ErrNoAlgorithmDefined {
+	if sug.algorithm == nil || err == ErrNoAlgorithmDefined {
 		t.Errorf("Expected the algorithm to be set")
 	}
 }