-
Notifications
You must be signed in to change notification settings - Fork 8
/
utils.go
177 lines (172 loc) · 4.79 KB
/
utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
package SetSimilaritySearch
import (
"bufio"
"errors"
"io"
"sort"
"strconv"
"strings"
)
// flattenedRawSetEntry is one "<set ID> <token>" pair parsed from a single
// line of a flattened set file.
type flattenedRawSetEntry struct {
	setID    string
	rawToken string
}

// ReadFlattenedRawSets takes an input of a flattened set file
// that contains unique lines in the format "<set ID> <token>", and returns
// the extracted set IDs and raw sets.
//
// Lines starting with "#" are ignored.
// If the input format is "<token> <set ID>" then set reversed to true.
//
// The input does not need to be sorted: all entries are read into memory and
// sorted by set ID first. The returned setIDs are in ascending string order
// and rawSets[i] holds the tokens of setIDs[i]. The order of tokens within a
// set is not guaranteed, because the sort used is not stable.
//
// If the input contains no data lines, empty (non-nil) slices are returned.
// An error is returned when a non-comment line does not consist of exactly
// two whitespace-separated fields (this includes blank lines), or when
// reading from file fails.
func ReadFlattenedRawSets(file io.Reader,
	reversed bool) (setIDs []string, rawSets [][]string, err error) {
	// Read flattened raw set entries.
	var entries []flattenedRawSetEntry
	scanner := bufio.NewScanner(file)
	// Raise the scanner's line-length limit to 4 GiB.
	// NOTE(review): this constant does not fit in a 32-bit int, so this
	// only compiles on platforms where int is 64 bits wide.
	scanner.Buffer(nil, 1024*1024*1024*4)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) != 2 {
			return nil, nil, errors.New("incorrect line detected")
		}
		entry := flattenedRawSetEntry{setID: fields[0], rawToken: fields[1]}
		if reversed {
			entry = flattenedRawSetEntry{setID: fields[1], rawToken: fields[0]}
		}
		entries = append(entries, entry)
	}
	if err := scanner.Err(); err != nil {
		return nil, nil, err
	}
	setIDs = make([]string, 0)
	rawSets = make([][]string, 0)
	// Nothing to merge: without this guard entries[0] below would panic on
	// an empty or comment-only input.
	if len(entries) == 0 {
		return setIDs, rawSets, nil
	}
	// Sort entries by setID so that entries of the same set are adjacent.
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].setID < entries[j].setID
	})
	// Create raw sets by merging runs of entries that share a set ID.
	currSetID := entries[0].setID
	currSet := make([]string, 0)
	for _, entry := range entries {
		if entry.setID != currSetID {
			// Append the completed set.
			setIDs = append(setIDs, currSetID)
			rawSets = append(rawSets, currSet)
			// Create new set.
			currSetID = entry.setID
			currSet = make([]string, 0)
		}
		currSet = append(currSet, entry.rawToken)
	}
	// Append the last set.
	setIDs = append(setIDs, currSetID)
	rawSets = append(rawSets, currSet)
	return setIDs, rawSets, nil
}
// ReadFlattenedSortedRawSets takes an input of a flattened set file
// that contains unique lines in the format "<set ID> <token>",
// sorted by <set ID>, and returns the extracted set IDs and raw sets.
//
// Lines starting with "#" are ignored.
// This function is more efficient than ReadFlattenedRawSets, but expects
// the input lines to be sorted: a new set is started every time the set ID
// differs from the one on the previous data line, so an unsorted input
// produces the same set ID more than once.
//
// rawSets[i] holds the tokens of setIDs[i], in input order. If the input
// contains no data lines, empty (non-nil) slices are returned.
// An error is returned when a non-comment line does not consist of exactly
// two whitespace-separated fields (this includes blank lines), or when
// reading from file fails.
func ReadFlattenedSortedRawSets(file io.Reader) (setIDs []string,
	rawSets [][]string, err error) {
	setIDs = make([]string, 0)
	rawSets = make([][]string, 0)
	// The set currently being accumulated. currSet is non-empty exactly
	// when at least one data line has been consumed for currSetID.
	var (
		currSetID string
		currSet   []string
	)
	scanner := bufio.NewScanner(file)
	// Raise the scanner's line-length limit to 4 GiB.
	// NOTE(review): this constant does not fit in a 32-bit int, so this
	// only compiles on platforms where int is 64 bits wide.
	scanner.Buffer(nil, 1024*1024*1024*4)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) != 2 {
			return nil, nil, errors.New("incorrect line detected")
		}
		setID, rawToken := fields[0], fields[1]
		if len(currSet) > 0 && setID != currSetID {
			// The set ID changed: append the completed set and start over.
			setIDs = append(setIDs, currSetID)
			rawSets = append(rawSets, currSet)
			currSet = nil
		}
		currSetID = setID
		currSet = append(currSet, rawToken)
	}
	if err := scanner.Err(); err != nil {
		return nil, nil, err
	}
	// Append the last set, if any. Without this check an empty input
	// would yield a phantom set with an empty ID and no tokens.
	if len(currSet) > 0 {
		setIDs = append(setIDs, currSetID)
		rawSets = append(rawSets, currSet)
	}
	return setIDs, rawSets, nil
}
// ReadFlattenedSortedTransformedSets takes an input of a flattened
// transformed set file
// that contains unique lines in the format "<set ID:int> <token:int>",
// sorted by <set ID>, and returns the extracted set IDs and sets.
//
// Lines starting with "#" are ignored.
// A new set is started every time the set ID differs from the one on the
// previous data line, so the input is expected to be grouped by set ID.
//
// sets[i] holds the tokens of setIDs[i], in input order. If the input
// contains no data lines, empty (non-nil) slices are returned.
// An error is returned when a non-comment line does not consist of exactly
// two whitespace-separated fields (this includes blank lines), when either
// field cannot be parsed as a decimal integer, or when reading from file
// fails.
func ReadFlattenedSortedTransformedSets(file io.Reader) (setIDs []int,
	sets [][]int, err error) {
	setIDs = make([]int, 0)
	sets = make([][]int, 0)
	// The set currently being accumulated. currSet is non-empty exactly
	// when at least one data line has been consumed for currSetID.
	var (
		currSetID int
		currSet   []int
	)
	scanner := bufio.NewScanner(file)
	// Raise the scanner's line-length limit to 4 GiB.
	// NOTE(review): this constant does not fit in a 32-bit int, so this
	// only compiles on platforms where int is 64 bits wide.
	scanner.Buffer(nil, 1024*1024*1024*4)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) != 2 {
			return nil, nil, errors.New("incorrect line detected")
		}
		setID, err := strconv.Atoi(fields[0])
		if err != nil {
			return nil, nil, err
		}
		token, err := strconv.Atoi(fields[1])
		if err != nil {
			return nil, nil, err
		}
		if len(currSet) > 0 && setID != currSetID {
			// The set ID changed: append the completed set and start over.
			setIDs = append(setIDs, currSetID)
			sets = append(sets, currSet)
			currSet = nil
		}
		currSetID = setID
		currSet = append(currSet, token)
	}
	if err := scanner.Err(); err != nil {
		return nil, nil, err
	}
	// Append the last set, if any. Without this check an empty input
	// would yield a phantom set with ID 0 and no tokens.
	if len(currSet) > 0 {
		setIDs = append(setIDs, currSetID)
		sets = append(sets, currSet)
	}
	return setIDs, sets, nil
}