Skip to content

Commit

Permalink
Marking typo support (#18)
Browse files Browse the repository at this point in the history
* Adding (pre-)processors, this introduces support for Marking Typo's (opt in, won't be used by default)
* Go mod update (to 1.19, from 1.17)
  • Loading branch information
Dynom authored Feb 23, 2023
1 parent 1955485 commit 7a70d0b
Show file tree
Hide file tree
Showing 20 changed files with 288 additions and 90 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ version: 2
jobs:
build-and-test:
docker:
- image: cimg/go:1.17
- image: cimg/go:1.18

environment:
BINARY_NAME: "TySug-linux-amd64"
TEST_RESULTS: "/tmp/test-results"
#GOFLAGS: "-buildvcs=false" # for 1.18
GOFLAGS: "-buildvcs=false -trimpath"


steps:
Expand All @@ -23,7 +23,7 @@ jobs:
name: Build
command: |
TAG=${CIRCLE_TAG:-dev}
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o "${TEST_RESULTS}/${BINARY_NAME}" -a -ldflags="-w -s -X main.Version=${TAG}" ./cmd/web
GOFLAGS="-buildvcs=false -trimpath" CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o "${TEST_RESULTS}/${BINARY_NAME}" -a -ldflags="-w -s -X main.Version=${TAG}" ./cmd/web
- run:
# Check if we have updates to minor/patch level packages we're explicitly referencing
Expand All @@ -34,7 +34,7 @@ jobs:
- run:
name: Lint
command: |
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.44.2
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.47.0
golangci-lint run
- run:
Expand Down
6 changes: 2 additions & 4 deletions .golangci.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
min-complexity = 20

[linters-settings.gofumpt]
lang-version = "1.17"
lang-version = "1.19"
extra-rules = true

[linters-settings.goconst]
Expand All @@ -27,9 +27,7 @@
"govet",
"errcheck",
"unused",
"structcheck",
"varcheck",
"deadcode",
"exhaustive",

"stylecheck",
"gosec",
Expand Down
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,6 @@ Dealing with typos is complicated and heavily context dependent.
- Using the Web for Language Independent Spellchecking and Autocorrection - [http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf](http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf)
- Spellchecking by computer - [https://www.dcs.bbk.ac.uk/..roger/spellchecking.html](https://www.dcs.bbk.ac.uk/~roger/spellchecking.html)

# Wishlist
- Support for [Marking Typos](https://en.wikipedia.org/wiki/Typographical_error#Marking_typos). -- Probably not particularly useful, but seems fun to implement.

# Contributing

First of all: Awesome!
Expand Down
8 changes: 4 additions & 4 deletions buildDocker.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/bin/sh
LATEST_TAG="$(git describe --tags $(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904))"
TAG_REF="$(git show-ref --hash --tags ${LATEST_TAG})"
LATEST_TAG="$(git describe --tags "$(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904)")"
TAG_REF="$(git show-ref --hash --tags "${LATEST_TAG}")"

docker build -t dynom/tysug:${LATEST_TAG} \
docker build -t "dynom/tysug:${LATEST_TAG}" \
--build-arg VERSION="${LATEST_TAG}" \
--build-arg GIT_REF="${TAG_REF}" \
. &&
docker tag dynom/tysug:${LATEST_TAG} dynom/tysug:latest
docker tag "dynom/tysug:${LATEST_TAG}" dynom/tysug:latest
3 changes: 1 addition & 2 deletions cmd/web/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package main

import (
"fmt"
"io/ioutil"
"net/http"
"os"

Expand Down Expand Up @@ -87,7 +86,7 @@ func main() {
func buildConfig(fileName string) (Config, error) {
c := Config{}

b, err := ioutil.ReadFile(fileName)
b, err := os.ReadFile(fileName)
if err != nil {
return c, fmt.Errorf("unable to open %q, reason: %s", fileName, err)
}
Expand Down
52 changes: 31 additions & 21 deletions createRelease.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,29 @@ set -o pipefail -o nounset -o errexit -o errtrace
ROOT_DIR="$(pwd)"
while [ ! -d "${ROOT_DIR}/.git" ]; do

ROOT_DIR="$(dirname ${ROOT_DIR})"
if [ "x${ROOT_DIR}" == "x/" ]; then
ROOT_DIR="$(dirname "${ROOT_DIR}")"
if [[ "x${ROOT_DIR}" == "x/" ]]; then
echo "Cannot find .git directory, I use that as reference for the commands."
exit 1
fi
done

# Determine our projectname
NAME="$(basename $(pwd))"
# Determine our project name
NAME="$(basename "$(pwd)")"

# Checking if we have any tags to start with; the hash below is Git's well-known empty-tree object ID
TAGS=$(git rev-list --tags --count 4b825dc642cb6eb9a060e54bf8d69288fbee4904)
if [ ${TAGS} -eq 0 ];
if [[ "${TAGS}" -eq 0 ]];
then
echo "No tags detected for ${ROOT_DIR}, please create a tag first!"
exit 1;
fi

# Figuring out which tags we're on
LATEST_TAG=$(git describe --tags $(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904))
LATEST_TAG=$(git describe --tags "$(git rev-list --tags --max-count=1 4b825dc642cb6eb9a060e54bf8d69288fbee4904)")
PREV_TAG=$(git tag --sort version:refname | tail -2 | head -1 || true)

if [ "x${LATEST_TAG}" == "x" -a "x${PREV_TAG}" == "x" ];
if [[ "x${LATEST_TAG}" == "x" && "x${PREV_TAG}" == "x" ]];
then
echo "No tag has been found?"
exit 1
Expand All @@ -37,7 +37,7 @@ echo "Previous tag is: ${PREV_TAG}"
echo "Building a release for tag: ${LATEST_TAG}"

# Falling back to the first commit, if we only have one tag
if [ "x${PREV_TAG}" == "x${LATEST_TAG}" ];
if [[ "x${PREV_TAG}" == "x${LATEST_TAG}" ]];
then
PREV_TAG=$(git rev-list --max-parents=0 HEAD)
fi
Expand All @@ -59,22 +59,32 @@ gox -ldflags "-s -w -X main.Version=${LATEST_TAG}" \
./cmd/web

# Archive
HERE=$(pwd)
BUILDDIR=${HERE}/build
for DIR in $(ls build/);
HERE="$(pwd)"
BUILD_DIR="${HERE}/build"
for DIR in "${BUILD_DIR}"/*;
do
OUTDIR="${HERE}/dist"
OUTFILENAME="${DIR}.tar.gz"
OUTFILE="${OUTDIR}/${OUTFILENAME}"
cd ${BUILDDIR}/${DIR} && \
tar -czf ${OUTFILE} * && \
cd ${OUTDIR} && \
shasum -a 512 ${OUTFILENAME} > ${OUTFILE}.sha512
BASE="$(basename "${DIR}")"
OUT_DIR="${HERE}/dist"
OUT_FILE_NAME="${BASE}.tar.gz"
OUT_FILE="${OUT_DIR}/${OUT_FILE_NAME}"
cd "${DIR}" && \
tar -czf "${OUT_FILE}" ./* && \
cd "${OUT_DIR}" && \
shasum -a 512 "${OUT_FILE_NAME}" > "${OUT_FILE}".sha512
done
cd ${HERE}
cd "${HERE}"

# Building the changelog
DIFF_REF="${PREV_TAG}..${LATEST_TAG}"
CHANGELOG="$(printf '# %s\n%s' 'Changelog' "$(git log ${DIFF_REF} --oneline --no-merges --reverse)")"
CHANGELOG="$(printf '# %s\n%s' 'Changelog' "$(git log "${DIFF_REF}" --oneline --no-merges --reverse)")"

echo "Building the changelog based on these two ref's: '${DIFF_REF}'"
github-release Dynom/${NAME} ${LATEST_TAG} "$(git rev-parse --abbrev-ref HEAD)" "${CHANGELOG}" 'dist/*';
# Publish the release with ghr. Owner and repository can be overridden through
# the environment; they fall back to "Dynom" and the project directory name.
# The ":-" operator is required for the fallback: "${VAR:${NAME}}" is a
# substring expansion and aborts under "nounset" when VAR is not set.
ghr -owner "${GITHUB_USERNAME:-Dynom}" \
  -repository "${GITHUB_REPOSITORY:-${NAME}}" \
  -commitish "$(git rev-parse HEAD)" \
  -delete \
  -body "${CHANGELOG}" \
  "${LATEST_TAG}" \
  ./dist/


1 change: 1 addition & 0 deletions finder/algorithm.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func NewWagnerFischer(insert, delete, substitution int) Algorithm {
// - Reduced allocations
// - Added rounding on the unaligned matches as per: http://www.alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
// - Added support for 1 character inputs, by making sure the match distances is never negative
//
//nolint:gocognit
func NewJaro() Algorithm {
return func(a, b string) float64 {
Expand Down
28 changes: 17 additions & 11 deletions finder/find.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@ import (

// Finder is the type to find the nearest reference
type Finder struct {
referenceMap referenceMapType
reference []string
referenceBucket referenceBucketType
Alg Algorithm
LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
lock *rwc.RWCMutex
bucketChars uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
referenceMap referenceMapType
reference []string
referenceBucket referenceBucketType
algorithm Algorithm
inputPreProcessors []Processor
lengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length mismatch, anything outside this is considered not similar. Set to 0 to disable.
lock *rwc.RWCMutex
bucketChars uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
}

// Errors
var (
ErrNoAlgorithmDefined = errors.New("no algorithm defined")
ErrPrefixExceedsInputLen = errors.New("prefix length exceeds input length")
Expand Down Expand Up @@ -51,7 +51,7 @@ func New(list []string, options ...Option) (*Finder, error) {

i.Refresh(list)

if i.Alg == nil {
if i.algorithm == nil {
return i, ErrNoAlgorithmDefined
}

Expand Down Expand Up @@ -173,6 +173,12 @@ func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLeng
return []string{input}, WorstScoreValue, false, ErrPrefixExceedsInputLen
}

if len(t.inputPreProcessors) > 0 {
for _, p := range t.inputPreProcessors {
input = p(input)
}
}

t.lock.RLock()
defer t.lock.RUnlock()

Expand All @@ -198,11 +204,11 @@ func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLeng
}

// Test if the input length differs too much from the reference, making it an unlikely typo.
if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
if !meetsLengthTolerance(t.lengthTolerance, input, ref) {
continue
}

score := t.Alg(input, ref)
score := t.algorithm(input, ref)
if score > hs {
hs = score
sameScore = sameScore[0:1]
Expand Down
12 changes: 6 additions & 6 deletions finder/find_benchmarks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func BenchmarkFindWithBucket(b *testing.B) {
f, _ := New(refs,
WithAlgorithm(alg),
WithLengthTolerance(0),
WithPrefixBuckets(false),
WithPrefixBuckets(true),
)

b.ResetTimer()
Expand All @@ -120,7 +120,7 @@ func BenchmarkFindWithBucket(b *testing.B) {
f, _ := New(refs,
WithAlgorithm(alg),
WithLengthTolerance(0),
WithPrefixBuckets(true),
WithPrefixBuckets(false),
)

b.ResetTimer()
Expand Down Expand Up @@ -166,11 +166,11 @@ func BenchmarkCopyOrAppend(b *testing.B) {
}
})

// "dst smaller copy" can't work, since the result won't contain all items or requires logic which'll make the
// implementation slower than an append
// "dst smaller copy" can't work, since the result won't contain all items or requires logic which makes the
// implementation slower than append

b.Run("dst smaller append", func(b *testing.B) {
refsAppendDst = make([]string, int(numToAllocate/2))
refsAppendDst = make([]string, numToAllocate/2)
b.ResetTimer()
for i := 0; i < b.N; i++ {
refsAppendDst = append(refsAppendSrc[:0:0], refsAppendSrc...)
Expand Down Expand Up @@ -198,7 +198,7 @@ func BenchmarkCopyOrAppend(b *testing.B) {
})

b.Run("dst larger append", func(b *testing.B) {
refsAppendDst = make([]string, int(numToAllocate*2))
refsAppendDst = make([]string, numToAllocate*2)
b.ResetTimer()
for i := 0; i < b.N; i++ {
refsAppendDst = append(refsAppendSrc[:0:0], refsAppendSrc...)
Expand Down
8 changes: 2 additions & 6 deletions finder/find_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ func TestNoInput(t *testing.T) {

func TestContextCancel(t *testing.T) {
sug, err := New([]string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"}, func(sug *Finder) {
sug.Alg = func(a, b string) float64 {
sug.algorithm = func(a, b string) float64 {
time.Sleep(10 * time.Millisecond)
return 1
}
Expand Down Expand Up @@ -453,7 +453,7 @@ func TestFinder_FindTopRankingPrefixCtx(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t1 *testing.T) {
finder, _ := New(refs, func(sug *Finder) {
sug.Alg = func(a, b string) float64 {
sug.algorithm = func(a, b string) float64 {
return 1
}
})
Expand Down Expand Up @@ -750,15 +750,13 @@ func Test_meetsPrefixLengthMatch(t *testing.T) {
}

func BenchmarkFindTopRankingCTXRace(b *testing.B) {

sort.Strings(inspirationalRefList)
f, err := New(
inspirationalRefList[0:5],
WithAlgorithm(exampleAlgorithm),
WithLengthTolerance(0),
WithPrefixBuckets(false),
)

if err != nil {
b.Fatal("Setting up test failed")
}
Expand All @@ -781,15 +779,13 @@ func BenchmarkFindTopRankingCTXRace(b *testing.B) {
}

func BenchmarkFindTopRankingCTX(b *testing.B) {

sort.Strings(inspirationalRefList)
f, err := New(
inspirationalRefList[0:5],
WithAlgorithm(exampleAlgorithm),
WithLengthTolerance(0),
WithPrefixBuckets(false),
)

if err != nil {
b.Fatal("Setting up test failed")
}
Expand Down
10 changes: 8 additions & 2 deletions finder/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ type Option func(sug *Finder)
// WithAlgorithm allows you to set any algorithm
func WithAlgorithm(alg Algorithm) Option {
return func(s *Finder) {
s.Alg = alg
s.algorithm = alg
}
}

Expand All @@ -15,7 +15,7 @@ func WithAlgorithm(alg Algorithm) Option {
// size, with a minimum of 1 character. A value of 0 (the default) disables this feature.
func WithLengthTolerance(t float64) Option {
return func(s *Finder) {
s.LengthTolerance = t
s.lengthTolerance = t
}
}

Expand All @@ -28,3 +28,9 @@ func WithPrefixBuckets(enable bool) Option {
}
}
}

// WithPreProcessor configures the processors that are applied to the input, in
// the order given, before it is compared against the reference list. It is
// opt-in: without this option the input is used as-is. Applying the option
// replaces any previously configured pre-processors.
func WithPreProcessor(p ...Processor) Option {
	return func(sug *Finder) {
		// Store a copy so later changes to the caller's slice (when invoked as
		// WithPreProcessor(procs...)) cannot alter the Finder's configuration.
		sug.inputPreProcessors = append([]Processor(nil), p...)
	}
}
2 changes: 1 addition & 1 deletion finder/option_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func TestSetAlgorithm(t *testing.T) {

sug, err := New([]string{}, WithAlgorithm(veryPositiveAlg))

if sug.Alg == nil || err == ErrNoAlgorithmDefined {
if sug.algorithm == nil || err == ErrNoAlgorithmDefined {
t.Errorf("Expected the algorithm to be set")
}
}
Expand Down
Loading

0 comments on commit 7a70d0b

Please sign in to comment.