Skip to content

Commit

Permalink
move all hash related functions to the BaseFilter class
Browse files Browse the repository at this point in the history
  • Loading branch information
folkvir committed Jan 3, 2022
1 parent 3305f9c commit 12417e7
Show file tree
Hide file tree
Showing 11 changed files with 306 additions and 261 deletions.
214 changes: 206 additions & 8 deletions src/base-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,17 @@ SOFTWARE.

'use strict'

import * as utils from './utils'
import seedrandom from 'seedrandom'
import XXH from 'xxhashjs'
import {
doubleHashing,
getDefaultSeed,
HashableInput,
numberToHex,
TwoHashes,
TwoHashesIntAndString,
TwoHashesTemplated,
} from './utils'

/**
* Exported prng type because it is not from seedrandom
Expand All @@ -49,22 +58,22 @@ export default abstract class BaseFilter {
private _rng: prng

constructor() {
this._seed = utils.getDefaultSeed()
this._seed = getDefaultSeed()
this._rng = seedrandom(`${this._seed}`) as prng
}

/**
* Get the seed used in this structure
*/
get seed(): number {
public get seed(): number {
return this._seed
}

/**
* Set the seed for this structure
* @param seed the new seed that will be used in this structure
*/
set seed(seed: number) {
public set seed(seed: number) {
this._seed = seed
this._rng = seedrandom(`${this._seed}`) as prng
}
Expand All @@ -73,22 +82,22 @@ export default abstract class BaseFilter {
* Get a function used to draw random number
* @return A factory function used to draw random integer
*/
get random(): prng {
public get random(): prng {
return this._rng
}

/**
* Return a next random seeded int32 integer
* @returns
*/
nextInt32(): number {
public nextInt32(): number {
return this._rng.int32()
}

/**
* Save the current structure as a JSON object
*/
saveAsJSON(): Object {
public saveAsJSON(): Object {
throw new Error('not-implemented')
}

Expand All @@ -98,7 +107,196 @@ export default abstract class BaseFilter {
* @return Return the Object loaded from the provided JSON object
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
static fromJSON(json: JSON): any {
public static fromJSON(json: JSON): any {
throw new Error(`not-implemented: ${json}`)
}

/**
 * Generate a set of distinct indexes on the interval [0, size) using the double hashing technique.
 * To produce distinct indexes efficiently, the element is re-hashed with an incremented seed once
 * more than `size` steps have been taken without collecting enough indexes.
 * This yields distinct indexes faster without entirely losing the benefit of double hashing.
 * It works well for a small number of indexes. For a number close to `size`, with `size` very large,
 * generation can take a long time because most candidates collide with indexes already drawn.
 * Advice: do not generate `size` indexes for a large interval. In practice, `number` should be equal
 * to the number of hash functions used and is often low.
 *
 * @param element - The element to hash
 * @param size - The exclusive upper bound of the range [0, size) in which indexes are generated
 * @param number - The number of distinct indexes desired; must not be greater than `size`
 * @param seed - The seed used for hashing (defaults to `getDefaultSeed()`)
 * @return The distinct indexes, in the order in which they were generated
 * @throws Error if `number` is greater than `size`, since that many distinct indexes cannot exist
 * @author Arnaud Grall
 * @author Simon Woolf (SimonWoolf)
 */
protected _getDistinctIndexes(
  element: HashableInput,
  size: number,
  number: number,
  seed?: number
): Array<number> {
  // Only `size` distinct values exist in [0, size): asking for more would loop forever.
  if (number > size) {
    throw new Error(
      `Cannot generate ${number} distinct indexes on the range [0, ${size})`
    )
  }
  if (seed === undefined) {
    seed = getDefaultSeed()
  }
  let n = 0
  const indexes: Set<number> = new Set()
  let hashes = this._hashTwice(element, seed)
  while (indexes.size < number) {
    // Set#add ignores values that are already present, so no membership check is needed.
    indexes.add(hashes.first % size)
    hashes.first = (hashes.first + hashes.second) % size
    hashes.second = (hashes.second + n) % size
    n++

    if (n > size) {
      // Enhanced double hashing stops cycles of length less than `size` in the case where
      // size is coprime with the second hash. But you still get cycles of length `size`.
      // So if we reach there and haven't finished, bump the seed and rehash.
      // `n` is never reset, so from this point on every iteration rehashes with a new seed.
      seed++
      hashes = this._hashTwice(element, seed)
    }
  }
  return [...indexes.values()]
}

/**
 * Compute `hashCount` indexes on the range [0, size) with the double hashing technique.
 * The element is hashed twice, once, up front; every index is then derived from
 * that single pair of hashes through `doubleHashing`.
 * Warning: the returned indexes are not guaranteed to be distinct (modulo collisions are possible).
 * @param element - The element to hash
 * @param size - The range on which we can generate the index, exclusive
 * @param hashCount - The number of indexes we want
 * @param seed - The seed used for hashing (defaults to `getDefaultSeed()`)
 * @return An array of indexes on range [0, size)
 */
protected _getIndexes(
  element: HashableInput,
  size: number,
  hashCount: number,
  seed?: number
): Array<number> {
  const effectiveSeed = seed === undefined ? getDefaultSeed() : seed
  const {first, second} = this._hashTwice(element, effectiveSeed)
  const result = []
  let round = 0
  while (round < hashCount) {
    result.push(doubleHashing(round, first, second, size))
    round++
  }
  return result
}

/**
 * @internal
 * Hash an element of type {@link HashableInput} into a {@link Number}.
 * Can be overridden as long as the override returns a value of type {@link Number}.
 * Don't forget to use the seed when hashing, otherwise if some kind of randomness is in the process
 * you may have inconsistent behaviors between 2 runs.
 * The default seed is used only when `seed` is `undefined`; a seed of `0` is a valid seed
 * and is passed to the hash function as-is.
 * @param element - The element to hash
 * @param seed - The seed used for hashing (defaults to `getDefaultSeed()`)
 * @returns The XXH.h64 hash of the element converted to a JavaScript {@link Number}
 */
protected _serialize(element: HashableInput, seed?: number): number {
  // Compare against undefined explicitly: a falsy check would discard the valid seed 0.
  if (seed === undefined) {
    seed = getDefaultSeed()
  }
  // NOTE(review): confirm how much of the 64-bit digest `toNumber()` preserves.
  return Number(XXH.h64(element, seed).toNumber())
}

/**
 * @internal
 * Hash a value into two numeric hashes.
 * Both hashes are produced by `_serialize`, using `seed + 1` for the first one
 * and `seed + 2` for the second one.
 * @param value - The value to hash
 * @param seed - The seed used for hashing (defaults to `getDefaultSeed()`)
 * @return The two hashes of the value, as numbers
 * @author Arnaud Grall & Thomas Minier
 */
protected _hashTwice(value: HashableInput, seed?: number): TwoHashes {
  const base = seed === undefined ? getDefaultSeed() : seed
  const first = this._serialize(value, base + 1)
  const second = this._serialize(value, base + 2)
  return {first, second}
}

/**
 * Hash an element twice and return both hashes converted with `numberToHex`.
 * @param value - The value to hash
 * @param seed - The seed forwarded to `_hashTwice`
 * @returns The two hashes as strings
 */
protected _hashTwiceAsString(
  value: HashableInput,
  seed?: number
): TwoHashesTemplated<string> {
  const hashes = this._hashTwice(value, seed)
  const first = numberToHex(hashes.first)
  const second = numberToHex(hashes.second)
  return {first, second}
}

/**
 * Same as `_hashTwice`, but return each of the two hashes both as a number and as a string.
 * The first hash uses `seed + 1` and the second one uses `seed + 2`.
 * @param val - The value to hash
 * @param seed - The seed used for hashing (defaults to `getDefaultSeed()`)
 * @return The two hashes grouped by representation: `int` and `string`
 * @author Arnaud Grall
 */
protected _hashTwiceIntAndString(
  val: HashableInput,
  seed?: number
): TwoHashesIntAndString {
  const base = seed === undefined ? getDefaultSeed() : seed
  const firstHash = this._hashIntAndString(val, base + 1)
  const secondHash = this._hashIntAndString(val, base + 2)
  const int = {first: firstHash.int, second: secondHash.int}
  const string = {first: firstHash.string, second: secondHash.string}
  return {int, string}
}

/**
 * Hash an item into a number by delegating to `_serialize`.
 * @param elem - Element to hash
 * @param seed - The hash seed (defaults to `getDefaultSeed()`)
 * @return The hash value as a number
 * @author Arnaud Grall
 */
protected _hashAsInt(elem: HashableInput, seed?: number): number {
  const effectiveSeed = seed === undefined ? getDefaultSeed() : seed
  return this._serialize(elem, effectiveSeed)
}

/**
 * Hash an item and return both its numeric hash and the `numberToHex` conversion of that hash.
 * @param elem - Element to hash
 * @param seed - The hash seed, forwarded to `_hashAsInt`
 * @return The item hashed, as `{int, string}`
 * @author Arnaud Grall
 */
protected _hashIntAndString(elem: HashableInput, seed?: number) {
  const value = this._hashAsInt(elem, seed)
  const asHex = numberToHex(value)
  return {int: value, string: asHex}
}
}
18 changes: 14 additions & 4 deletions src/bloom/bloom-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import BaseFilter from '../base-filter'
import BitSet from './bit-set'
import {AutoExportable, Field, Parameter} from '../exportable'
import {optimalFilterSize, optimalHashes} from '../formulas'
import {HashableInput, getIndexes} from '../utils'
import {HashableInput} from '../utils'

/**
* A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970,
Expand Down Expand Up @@ -83,7 +83,7 @@ export default class BloomFilter
static create(nbItems: number, errorRate: number): BloomFilter {
const size = optimalFilterSize(nbItems, errorRate)
const hashes = optimalHashes(size, nbItems)
return new BloomFilter(size, hashes)
return new this(size, hashes)
}

/**
Expand Down Expand Up @@ -138,7 +138,12 @@ export default class BloomFilter
* ```
*/
add(element: HashableInput): void {
const indexes = getIndexes(element, this._size, this._nbHashes, this.seed)
const indexes = this._getIndexes(
element,
this._size,
this._nbHashes,
this.seed
)
for (let i = 0; i < indexes.length; i++) {
this._filter.add(indexes[i])
}
Expand All @@ -157,7 +162,12 @@ export default class BloomFilter
* ```
*/
has(element: HashableInput): boolean {
const indexes = getIndexes(element, this._size, this._nbHashes, this.seed)
const indexes = this._getIndexes(
element,
this._size,
this._nbHashes,
this.seed
)
for (let i = 0; i < indexes.length; i++) {
if (!this._filter.has(indexes[i])) {
return false
Expand Down
23 changes: 19 additions & 4 deletions src/bloom/counting-bloom-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import BaseFilter from '../base-filter'
import WritableFilter from '../interfaces/writable-filter'
import {AutoExportable, Field, Parameter} from '../exportable'
import {optimalFilterSize, optimalHashes} from '../formulas'
import {HashableInput, allocateArray, getIndexes} from '../utils'
import {HashableInput, allocateArray} from '../utils'

/**
* A Counting Bloom filter works in a similar manner as a regular Bloom filter; however, it is able to keep track of insertions and deletions. In a counting Bloom filter, each entry in the Bloom filter is a small counter associated with a basic Bloom filter bit.
Expand Down Expand Up @@ -128,7 +128,12 @@ export default class CountingBloomFilter
* ```
*/
add(element: HashableInput): void {
const indexes = getIndexes(element, this._size, this._nbHashes, this.seed)
const indexes = this._getIndexes(
element,
this._size,
this._nbHashes,
this.seed
)
for (let i = 0; i < indexes.length; i++) {
// increment counter
this._filter[indexes[i]][1] += 1
Expand All @@ -150,7 +155,12 @@ export default class CountingBloomFilter
* ```
*/
remove(element: HashableInput): boolean {
const indexes = getIndexes(element, this._size, this._nbHashes, this.seed)
const indexes = this._getIndexes(
element,
this._size,
this._nbHashes,
this.seed
)
const success = true
for (let i = 0; i < indexes.length; i++) {
// decrement counter
Expand All @@ -177,7 +187,12 @@ export default class CountingBloomFilter
* ```
*/
has(element: HashableInput): boolean {
const indexes = getIndexes(element, this._size, this._nbHashes, this.seed)
const indexes = this._getIndexes(
element,
this._size,
this._nbHashes,
this.seed
)
for (let i = 0; i < indexes.length; i++) {
if (!this._filter[indexes[i]][0]) {
return false
Expand Down
Loading

0 comments on commit 12417e7

Please sign in to comment.