-
Notifications
You must be signed in to change notification settings - Fork 46
/
count-min-sketch.ts
212 lines (194 loc) · 6.61 KB
/
count-min-sketch.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
/* file : count-min-sketch.ts
MIT License
Copyright (c) 2017-2020 Thomas Minier & Arnaud Grall
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
import BaseFilter from '../base-filter'
import CountingFilter from '../interfaces/counting-filter'
import {AutoExportable, Field, Parameter} from '../exportable'
import {allocateArray, HashableInput} from '../utils'
/**
* The count–min sketch (CM sketch) is a probabilistic data structure that serves as a frequency table of events in a stream of data.
* It uses hash functions to map events to frequencies, but unlike a hash table uses only sub-linear space, at the expense of overcounting some events due to collisions.
*
* Reference: Cormode, G., & Muthukrishnan, S. (2005). An improved data stream summary: the count-min sketch and its applications. Journal of Algorithms, 55(1), 58-75.
* @see {@link http://vaffanculo.twiki.di.uniroma1.it/pub/Ing_algo/WebHome/p14_Cormode_JAl_05.pdf} for more details on Count Min Sketch
* @extends Exportable
* @author Thomas Minier & Arnaud Grall
*/
@AutoExportable<CountMinSketch>('CountMinSketch', ['_seed'])
export default class CountMinSketch
extends BaseFilter
implements CountingFilter<HashableInput>
{
@Field()
public _columns: number
@Field()
public _rows: number
@Field()
public _matrix: Array<Array<number>>
@Field()
public _allSums: number
/**
* Constructor
* @param columns - Number of columns
* @param rows - Number of rows
*/
constructor(
@Parameter('_columns') columns: number,
@Parameter('_rows') rows: number
) {
super()
this._columns = columns
this._rows = rows
this._matrix = allocateArray(this._rows, () =>
allocateArray(this._columns, 0)
)
this._allSums = 0
}
/**
* Create a count-min sketch, with a target error rate and probability of accuracy
* @param errorRate - The error rate
* @param accuracy - The probability of accuracy
* @return A new Count Min Sketch optimal for the input parameters
*/
public static create(errorRate: number, accuracy = 0.999): CountMinSketch {
// columns = Math.ceil(Math.E / epsilon) and rows = Math.ceil(Math.log(1 / delta))
const columns = Math.ceil(Math.E / errorRate)
const rows = Math.ceil(Math.log(1 / accuracy))
return new CountMinSketch(columns, rows)
}
/**
* Create a Count Min Sketch from a set of items, with a target error rate and probability of accuracy
* @param items - An iterable to yield items to be inserted into the filter
* @param errorRate - The error rate
* @param accuracy - The probability of accuracy
* @return A new Count Min Sketch filled with the iterable's items.
*/
public static from(
items: Iterable<HashableInput>,
errorRate: number,
accuracy = 0.999
): CountMinSketch {
const filter = CountMinSketch.create(errorRate, accuracy)
for (const item of items) {
filter.update(item)
}
return filter
}
/**
* Return the number of columns in the sketch
*/
public get columns(): number {
return this._columns
}
/**
* Return the number of rows in the sketch
*/
public get rows(): number {
return this._rows
}
/**
* Get the sum of all counts in the sketch
*/
public get sum(): number {
return this._allSums
}
/**
* Update the count min sketch with a new occurrence of an element
* @param element - The new element
* @param count - Number of occurences of the elemnt (defauls to one)
*/
public update(element: HashableInput, count = 1): void {
this._allSums += count
const indexes = this._hashing.getIndexes(
element,
this._columns,
this._rows,
this.seed
)
for (let i = 0; i < this._rows; i++) {
this._matrix[i][indexes[i]] += count
}
}
/**
* Perform a point query: estimate the number of occurence of an element
* @param element - The element we want to count
* @return The estimate number of occurence of the element
*/
public count(element: HashableInput): number {
let min = Infinity
const indexes = this._hashing.getIndexes(
element,
this._columns,
this._rows,
this.seed
)
for (let i = 0; i < this._rows; i++) {
const v = this._matrix[i][indexes[i]]
min = Math.min(v, min)
}
return min
}
/**
* Check if another Count Min Sketch is equal to this one
* @param filter - The filter to compare to this one
* @return True if they are equal, false otherwise
*/
public equals(other: CountMinSketch): boolean {
if (this._columns !== other._columns || this._rows !== other._rows) {
return false
}
for (let i = 0; i < this._rows; i++) {
for (let j = 0; j < this._columns; j++) {
if (this._matrix[i][j] !== other._matrix[i][j]) {
return false
}
}
}
return true
}
/**
* Merge (in place) this sketch with another sketch, if they have the same number of columns and rows.
* @param sketch - The sketch to merge with
*/
public merge(sketch: CountMinSketch): void {
if (this._columns !== sketch._columns) {
throw new Error(
'Cannot merge two sketches with different number of columns'
)
}
if (this._rows !== sketch._rows) {
throw new Error('Cannot merge two sketches with different number of rows')
}
for (let i = 0; i < this._rows; i++) {
for (let j = 0; j < this._columns; j++) {
this._matrix[i][j] += sketch._matrix[i][j]
}
}
}
/**
* Clone the sketch
* @return A new cloned sketch
*/
public clone(): CountMinSketch {
const sketch = new CountMinSketch(this._columns, this._rows)
sketch.merge(this)
sketch.seed = this.seed
return sketch
}
}