-
Notifications
You must be signed in to change notification settings - Fork 8
/
similarity.go
138 lines (114 loc) · 3.29 KB
/
similarity.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package SetSimilaritySearch
import "math"
func min(a, b int) int {
if a < b {
return a
}
return b
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
// IntersectionSize computes the number of overlaps of two transformed sets
// (sorted integers).
func intersectionSize(s1, s2 []int) int {
var i, j int
var overlap int
for i < len(s1) && j < len(s2) {
switch d := s1[i] - s2[j]; {
case d == 0:
overlap++
i++
j++
case d < 0:
i++
case d > 0:
j++
}
}
return overlap
}
type function func([]int, []int) float64
// Jaccard computes the Jaccard similarity of two transformed sets.
func jaccard(s1, s2 []int) float64 {
if len(s1) == 0 && len(s2) == 0 {
return 0.0
}
intersectionSize := intersectionSize(s1, s2)
return float64(intersectionSize) / float64(len(s1)+len(s2)-intersectionSize)
}
// Containment computes the Containment of s1 in s2 -- the fraction of s1
// being found in s2.
func containment(s1, s2 []int) float64 {
if len(s1) == 0 {
return 0.0
}
intersectionSize := intersectionSize(s1, s2)
return float64(intersectionSize) / float64(len(s1))
}
func cosine(s1, s2 []int) float64 {
if len(s1) == 0 && len(s2) == 0 {
return 0.0
}
intersectionSize := intersectionSize(s1, s2)
return float64(intersectionSize) / math.Sqrt(float64(len(s1)*len(s2)))
}
type overlapThresholdFunction func(int, float64) int
// x is the set size
// t is the Jaccard threshold
func jaccardOverlapThresholdFunc(x int, t float64) int {
return max(1, int(float64(x)*t))
}
var jaccardOverlapIndexThresholdFunc = jaccardOverlapThresholdFunc
func cosineOverlapThresholdFunc(x int, t float64) int {
return int(math.Sqrt(float64(x)) * t)
}
var cosineOverlapIndexThresholdFunc = cosineOverlapThresholdFunc
// This is used for query only.
func containmentOverlapThresholdFunc(x int, t float64) int {
return max(1, int(float64(x)*t))
}
func containmentOverlapIndexThresholdFunc(x int, t float64) int {
return 1
}
type positionFilter func([]int, []int, int, int, float64) bool
func jaccardPositionFilter(s1, s2 []int, p1, p2 int, t float64) bool {
l1, l2 := len(s1), len(s2)
return float64(min(l1-p1, l2-p2))/float64(max(l1, l2)) >= t
}
func containmentPositionFilter(s1, s2 []int, p1, p2 int, t float64) bool {
l1, l2 := len(s1), len(s2)
return float64(min(l1-p1, l2-p2))/float64(l1) >= t
}
func cosinePositionFilter(s1, s2 []int, p1, p2 int, t float64) bool {
l1, l2 := len(s1), len(s2)
return float64(min(l1-p1, l2-p2))/math.Sqrt(float64(max(l1, l2))) >= t
}
var similarityFuncs = map[string]function{
"jaccard": jaccard,
"containment": containment,
"cosine": cosine,
}
var overlapThresholdFuncs = map[string]overlapThresholdFunction{
"jaccard": jaccardOverlapThresholdFunc,
"containment": containmentOverlapThresholdFunc,
"cosine": cosineOverlapThresholdFunc,
}
var overlapIndexThresholdFuncs = map[string]overlapThresholdFunction{
"jaccard": jaccardOverlapIndexThresholdFunc,
"containment": containmentOverlapIndexThresholdFunc,
"cosine": cosineOverlapIndexThresholdFunc,
}
var positionFilterFuncs = map[string]positionFilter{
"jaccard": jaccardPositionFilter,
"containment": containmentPositionFilter,
"cosine": cosinePositionFilter,
}
var symmetricSimilarityFuncs = map[string]bool{
"jaccard": true,
"containment": false,
"cosine": true,
}