-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathjaro.go
79 lines (75 loc) · 1.81 KB
/
jaro.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package textdistance
import (
"math"
)
// JaroDistance calculates jaro distance between s1 and s2.
// This implementation is influenced by an implementation of [lucene](http://lucene.apache.org/)
// Note that this calculation's result is normalized ( the result will be bewtwen 0 and 1)
// and if t1 and t2 are exactly the same, the result is 1.0.
// This function returns distance and prefix (for jaro-winkler distance)
func JaroDistance(s1, s2 string) (float64, int) {
if s1 == s2 {
return 1.0, 0.0
}
// compare length using rune slice length, as s1 and s2 are not necessarily ASCII-only strings
longer, shorter := []rune(s1), []rune(s2)
if len(longer) < len(shorter) {
longer, shorter = shorter, longer
}
scope := int(math.Floor(float64(len(longer)/2))) - 1
// m is the number of matching characters.
m := 0
matchFlags := make([]bool, len(longer))
matchIndexes := make([]int, len(longer))
for i := range matchIndexes {
matchIndexes[i] = -1
}
for i := 0; i < len(shorter); i++ {
k := Min(i+scope+1, len(longer))
for j := Max(i-scope, 0); j < k; j++ {
if matchFlags[j] || shorter[i] != longer[j] {
continue
}
matchIndexes[i] = j
matchFlags[j] = true
m++
break
}
}
ms1 := make([]rune, m)
ms2 := make([]rune, m)
si := 0
for i := 0; i < len(shorter); i++ {
if matchIndexes[i] != -1 {
ms1[si] = shorter[i]
si++
}
}
si = 0
for i := 0; i < len(longer); i++ {
if matchFlags[i] {
ms2[si] = longer[i]
si++
}
}
t := 0
for i, c := range ms1 {
if c != ms2[i] {
t++
}
}
prefix := 0
for i := 0; i < len(shorter); i++ {
if longer[i] == shorter[i] {
prefix++
} else {
break
}
}
if m == 0 {
return 0.0, 0.0
}
newt := float64(t) / 2.0
newm := float64(m)
return 1 / 3.0 * (newm/float64(len(shorter)) + newm/float64(len(longer)) + (newm-newt)/newm), prefix
}