Skip to content

Commit

Permalink
Updated smetrics package and changed the Jaro implementation to inclu…
Browse files Browse the repository at this point in the history
…de rounding of transposition halves
  • Loading branch information
Dynom committed Jul 28, 2020
1 parent 50e356e commit 50715df
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 76 deletions.
8 changes: 6 additions & 2 deletions finder/algorithm.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ func NewWagnerFischer(insert, delete, substitution int) Algorithm {
}

// NewJaro returns the default Jaro algorithm
// @see https://rosettacode.org/wiki/Jaro_distance#Go
// @see Original https://rosettacode.org/wiki/Jaro_distance#Go
// Relevant discussions: https://github.com/xrash/smetrics/issues/7#issuecomment-664794681
// Changes over original:
// - Reduced allocations
// - Added rounding on the unaligned matches as per: http://www.alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
//nolint:gocyclo
func NewJaro() Algorithm {
return func(a, b string) float64 {
Expand Down Expand Up @@ -127,6 +131,6 @@ func NewJaro() Algorithm {

return (matches/float64(len(a)) +
matches/float64(len(b)) +
(matches-(transpositions/2))/matches) / 3
(matches-math.Floor(transpositions/2))/matches) / 3
}
}
77 changes: 4 additions & 73 deletions finder/algorithm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ func TestHomoPhoneJaroImplementations(t *testing.T) {
var scores = make([]float64, 4)
scores[smetricsJaro] = smetrics.Jaro(a, b)
scores[rosettaJaroV0] = RosettaJaroV0(a, b)
scores[rosettaJaroV1] = RosettaJaroV1(a, b)
scores[rosettaJaroV1] = NewJaro()(a, b)
scores[jaroDistanceMasatana] = func() float64 {
s, _ := JaroDistanceMasatana(a, b)
return s
Expand All @@ -525,6 +525,7 @@ func TestHomoPhoneJaroImplementations(t *testing.T) {
}

func TestJaroImplementations(t *testing.T) {
RosettaJaroV1 := NewJaro()
for _, tt := range jaroReferenceList {
score := NewJaro()(tt.a, tt.b)

Expand Down Expand Up @@ -670,6 +671,7 @@ func BenchmarkJaroImplementations(b *testing.B) {
})

b.Run("RosettaJaro V1", func(b *testing.B) {
RosettaJaroV1 := NewJaro()
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
Expand Down Expand Up @@ -711,6 +713,7 @@ func BenchmarkRosettaJaro(b *testing.B) {
}
})
b.Run("Single alloc", func(b *testing.B) {
RosettaJaroV1 := NewJaro()
b.ReportAllocs()
b.ResetTimer()

Expand Down Expand Up @@ -792,75 +795,3 @@ func RosettaJaroV0(a, b string) float64 {
matches/float64(len(b)) +
(matches-(transpositions/2))/matches) / 3
}

// RosettaJaroV1 returns the Jaro similarity of a and b: 1 when both strings
// are empty, 0 when exactly one is empty or when no bytes match, and
// otherwise the mean of matches/len(a), matches/len(b) and
// (matches - halfTranspositions)/matches.
//
// @see https://rosettacode.org/wiki/Jaro_distance#Go
// Changes over the Rosetta Code original:
// - Allocation reduction: one shared []bool tracks matches for both inputs.
// - Half-transpositions are rounded down, so an odd number of out-of-order
//   matches no longer contributes a fractional transposition.
// - The match window is clamped to zero, so identical one-byte strings score
//   1 instead of 0.
//
// NOTE(review): bytes are compared, but the outer loops range over the
// string, so only offsets at which a rune starts in a are visited. Results
// for non-ASCII input should be confirmed against the intended semantics.
func RosettaJaroV1(a, b string) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1
	}
	if len(a) == 0 || len(b) == 0 {
		return 0
	}

	// Two bytes may match only when they are at most matchDistance positions
	// apart: half the longer length, minus one.
	matchDistance := len(a)
	if len(b) > matchDistance {
		matchDistance = len(b)
	}
	matchDistance = matchDistance/2 - 1
	if matchDistance < 0 {
		// Happens when the longer input is a single byte; without the clamp
		// the search window below would be empty and nothing could match.
		matchDistance = 0
	}

	// Single allocation shared by both inputs: the first len(a) flags belong
	// to a, the remaining len(b) flags belong to b.
	matchesCollected := make([]bool, len(a)+len(b))
	matchedA := matchesCollected[:len(a)]
	matchedB := matchesCollected[len(a):]

	matches := 0
	for i := range a {
		start := i - matchDistance
		if start < 0 {
			start = 0
		}

		end := i + matchDistance + 1
		if end > len(b) {
			end = len(b)
		}

		// Claim the first unclaimed, equal byte of b inside the window.
		for k := start; k < end; k++ {
			if matchedB[k] || a[i] != b[k] {
				continue
			}

			matchedA[i] = true
			matchedB[k] = true
			matches++
			break
		}
	}

	if matches == 0 {
		return 0
	}

	// Walk the matched bytes of a and b in order and count the positions at
	// which the two matched sequences disagree.
	transpositions := 0
	k := 0
	for i := range a {
		if !matchedA[i] {
			continue
		}

		for !matchedB[k] {
			k++
		}

		if a[i] != b[k] {
			transpositions++
		}

		k++
	}

	m := float64(matches)
	// Integer division rounds the half-transposition count down.
	halfTranspositions := float64(transpositions / 2)

	return (m/float64(len(a)) +
		m/float64(len(b)) +
		(m-halfTranspositions)/m) / 3
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/mitchellh/gox v1.0.1 // indirect
github.com/rs/cors v1.7.0
github.com/sirupsen/logrus v1.5.0
github.com/xrash/smetrics v0.0.0-20170218160415-a3153f7040e9
github.com/xrash/smetrics v0.0.0-20200723181607-f06e43cca1ab
github.com/zikes/sift4 v0.0.0-20151103205100-8e89a8aebc1f
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 // indirect
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/xrash/smetrics v0.0.0-20170218160415-a3153f7040e9 h1:w8V9v0qVympSF6GjdjIyeqR7+EVhAF9CBQmkmW7Zw0w=
github.com/xrash/smetrics v0.0.0-20170218160415-a3153f7040e9/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
github.com/xrash/smetrics v0.0.0-20200723181607-f06e43cca1ab h1:UJJSloV3F3NHvn3mWDJ6s2441UlIzjcyn2YUcEPgL0c=
github.com/xrash/smetrics v0.0.0-20200723181607-f06e43cca1ab/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
github.com/zikes/sift4 v0.0.0-20151103205100-8e89a8aebc1f h1:uwsos4EwJT8IjkwPbPl78dIoouY41YQt4ocQAAKo5+o=
github.com/zikes/sift4 v0.0.0-20151103205100-8e89a8aebc1f/go.mod h1:RkT6UOQ/xx22IhYMzJoJsW3eMFc96fptPf6k/0dgncE=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
Expand Down

0 comments on commit 50715df

Please sign in to comment.