diff --git a/README.md b/README.md index e9d5479..2d0ea04 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,11 @@ JavaScript/TypeScript implementation of probabilistic data structures: Bloom Fil ❗️**Compatibility**❗️ -- Be carefull when migrating from a version to another. +- Be careful when migrating from one version to another. - Bug fixes were introduced in `1.3.7` and from `1.3.9` to `2.0.0+` for hashing and indexing data. Then, you **must re-build completely your filters from start** to be compatible with the new versions. -- To keep the `breaking changes` rule of npm versions we will make now new `majored versions` since 1.3.9 whenever a modification is done on the hashing/indexing system or breaks the current API. + +From v4.0.0+ we use the xxh64 (64-bits) and xxh128 (128-bits) hash functions from [`@node-rs/xxhash`](https://www.npmjs.com/package/@node-rs/xxhash). +So you must rebuild your filters if you use this new version. See the changelog for the complete list of changes. # Table of contents @@ -79,7 +81,7 @@ const items = ['alice', 'bob'] const errorRate = 0.04 // 4 % error rate filter = BloomFilter.create(items.length, errorRate) -// or create a bloom filter optimal for a collections of items and a desired error rate +// or create a bloom filter optimal for a collection of items and a desired error rate filter = BloomFilter.from(items, errorRate) ``` @@ -91,7 +93,7 @@ This filter works by partitioning the M-sized bit array into k slices of size `m Each hash function produces an index over `m` for its respective slice. Thus, each element is described by exactly `k` bits, meaning the distribution of false positives is uniform across all elements. -Be careful, as a Partitioned Bloom Filter have much higher collison risks that a classic Bloom Filter on small sets of data. +Be careful, as a Partitioned Bloom Filter has much higher collision risks than a classic Bloom Filter on small sets of data. **Reference:** Chang, F., Feng, W. C., & Li, K. (2004, March). 
_Approximate caches for packet classification._ In INFOCOM 2004. Twenty-third AnnualJoint Conference of the IEEE Computer and Communications Societies (Vol. 4, pp. 2196-2207). IEEE. ([Full text article](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.153.6902&rep=rep1&type=pdf)) @@ -137,7 +139,7 @@ number of elements stored, while assuring a maximum false positive probability **Reference:** ALMEIDA, Paulo Sérgio, BAQUERO, Carlos, PREGUIÇA, Nuno, et al. Scalable bloom filters. Information Processing Letters, 2007, vol. 101, no 6, p. 255-261. ([Full text article](https://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf)) -This filter use internally [Paritionned Bloom Filters](#partitioned-bloom-filter). +This filter uses internally [Partitioned Bloom Filters](#partitioned-bloom-filter). #### Methods @@ -261,9 +263,9 @@ It uses hash functions to map events to frequencies, but unlike a hash table use #### Methods -- `update(element: HashableInput, count = 1) -> void`: add `count` occurences of an element into the sketch. -- `count(element: HashableInput) -> number`: estimate the number of occurences of an element. -- `merge(other: CountMinSketch) -> CountMinSketch`: merge occurences of two sketches. +- `update(element: HashableInput, count = 1) -> void`: add `count` occurrences of an element into the sketch. +- `count(element: HashableInput) -> number`: estimate the number of occurrences of an element. +- `merge(other: CountMinSketch) -> CountMinSketch`: merge occurrences of two sketches. - `equals(other: CountMinSketch) -> boolean`: Test if two sketchs are equals. - `clone(): CountMinSketch`: Clone the sketch. @@ -297,16 +299,16 @@ sketch = CountMinSketch.from(items, errorRate, accuracy) ### HyperLogLog HyperLogLog is an algorithm for the count-distinct problem, approximating the number of distinct elements in a multiset. 
Calculating the exact cardinality of a multiset requires an amount of memory proportional to the cardinality, which is impractical for very large data sets. Probabilistic cardinality estimators, such as the HyperLogLog algorithm, use significantly less memory than this, at the cost of obtaining only an approximation of the cardinality. -The HyperLogLog algorithm is able to estimate cardinalities greather than `10e9` with a typical accuracy (standard error) of `2%`, using around 1.5 kB of memory (see reference). +The HyperLogLog algorithm is able to estimate cardinalities greater than `10e9` with a typical accuracy (standard error) of `2%`, using around 1.5 kB of memory (see reference). **Reference:** Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier (2007). _"Hyperloglog: The analysis of a near-optimal cardinality estimation algorithm"_. Discrete Mathematics and Theoretical Computer Science Proceedings. ([Full text article](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)) #### Methods -- `update(element: HashableInput) -> void`: add a new occurence of an element to the sketch. +- `update(element: HashableInput) -> void`: add a new occurrence of an element to the sketch. - `count() -> number`: estimate the number of distinct elements in the sketch. -- `merge(other: HyperLogLog) -> HyperLogLog`: merge occurences of two sketches. +- `merge(other: HyperLogLog) -> HyperLogLog`: merge occurrences of two sketches. - `equals(other: HyperLogLog) -> boolean`: Test if two sketchs are equals. ```javascript @@ -349,7 +351,7 @@ It does so by computing fixed sized signatures for a set of numbers using random #### `MinHash` methods - `add(element: number) -> void`: add a new element to the set. -- `bulkLoad(elements: number[]) -> void`: efficently add several new elements to the set. +- `bulkLoad(elements: number[]) -> void`: efficiently add several new elements to the set. - `isEmpty() -> boolean`: test if the signature of the MinHash is empty. 
- `compareWith(other: MinHash) -> number`: estimate the Jaccard similarity coefficient with another MinHash set. @@ -396,7 +398,7 @@ interface TopkElement { #### Methods -- `add(element: string, count: number = 1) -> void`: add one or more new occurences of an element to the sketch. +- `add(element: string, count: number = 1) -> void`: add one or more new occurrences of an element to the sketch. - `values() -> Array`: get the top-k values as an array of objects. - `iterator() -> Iterator`: get the top-k values as an iterator that yields objects. @@ -440,7 +442,7 @@ They can simultaneously calculate D(A−B) and D(B−A) using O(d) space. This d - `remove(element: string) -> void`: delete an element from the filter, returning True if the deletion was a success and False otherwise. - `has(element: string) -> boolean`: Test an element for membership, returning False if the element is definitively not in the filter and True is the element might be in the filter. - `equals(other: InvertibleBloomFilter) -> boolean`: Test if two filters are equals. -- `substract(remote: InvertibleBloomFilter)`: peform the XOR substraction of two IBLTs. +- `substract(remote: InvertibleBloomFilter)`: perform the XOR subtraction of two IBLTs. - `decode() -> {additional: string[], missing: string[]} `: decode an IBLT. - `listEntries() -> string[]`: list all entries in the IBLT using a Generator. @@ -487,11 +489,12 @@ console.log(`Elements of remote missing elements from iblt: ${result.missing}`) ### XOR Filter -**Available as 8-bits and 16-bits fingerprint length** +**Available as 8-bits, 16-bits, 32-bits and 64-bits fingerprint sizes** A XOR Filter is a better space-efficient probabilistic data structure than Bloom Filters. Very usefull for space efficiency of readonly sets. + **Reference:** Graf, Thomas Mueller, and Daniel Lemire. "Xor filters: Faster and smaller than bloom and cuckoo filters." Journal of Experimental Algorithmics (JEA) 25 (2020): 1-16. 
([Full text article](https://arxiv.org/abs/1912.08258)) @@ -547,51 +550,46 @@ console.log(filter.has('bob')) // output: false ## Seeding and Hashing -By default every hash function is seeded with an internal seed which is equal to `0x1234567890`. If you want to change it: +By default every hash function is seeded with an internal seed which is equal to `BigInt('0x1234567890')`. If you want to change it: ```javascript const { BloomFilter } = require('bloom-filter') const bl = new BloomFilter(...) -console.log(bl.seed) // 78187493520 -bl.seed = 0xABCD -console.log(bl.seed) // 43981 +console.log(bl.seed) // 78187493520n +bl.seed = BigInt('0xABCD') +console.log(bl.seed) // 43981n ``` -By default we hash elements using `XXH.h64` function from [`xxhashjs`](https://github.com/pierrec/js-xxhash). -In the case you want to use your own hash functions, you can use your own Hashing class by extending the default one. Example: +By default we hash elements using the `xxh64` and `xxh128` functions from [`@node-rs/xxhash`](https://www.npmjs.com/package/@node-rs/xxhash). +In case you want to use your own hash functions, you can use your own hash function class by extending the default one. Example: ```js const {BloomFilter, Hashing} = require('bloom-filters') - -class CustomHashing extends Hashing { - serialize(_element, _seed) { - return Number(1) - } +const filter = BloomFilter.create(2, 0.01) +Hashing.lib = { + xxh64: (x, _) => 1n, + xxh128: (x, _) => 1n, } - -const bl = BloomFilter.create(2, 0.01) -// override just your structure locally -bl._hashing = new CustomHashing() -bl.add('a') +filter._hashing._lib = Hashing.lib +const hashes = filter._hashing.hashTwice('something equal to 1n') +console.log(hashes.first === 1n, hashes.second === 1n) ``` -See `test/utils-test.js` "_Use different hash functions_" describe close. 
- ## Documentation See [documentation online](https://callidon.github.io/bloom-filters/) or generate it in directory `doc/` with: `npm run doc` ## Tests and Development -- Tests are performed using [mocha](https://github.com/mochajs/mocha) and [nyc](https://github.com/istanbuljs/nyc) (code coverage) on node 12.x, 14.x, 15.x and 16.x for the moment. +- Tests are performed using jest on node 18+ - Linting and formatting are made using `prettier` and `eslint` When submitting pull requests please follow the following guidance: - -- Please open pull requests on the develop branch. **Direct contributions to the master branch will be refused without comments** -- Add tests when possible in the `test` folder. +- Describe the changes as much as possible +- Provide meaningful examples and guidance for testing +- Add tests when possible in the `tests` folder. - Functions, methods, variables and types must be documented using typedoc annotations -- Run `yarn test` (build, lint and run the mocha tests suite) +- Run `yarn test` to run all the steps. 
## References @@ -610,8 +608,8 @@ When submitting pull requests please follow the following guidance: | **Version** | **Release date** | **Major changes** | | ----------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `v2.1.0` | 03/2022 | - Add Scalable Bloom filters
- Use array of BitSet for Partitionned Bloom Filter
- Fix wrong MinHash comparison | -| `v2.0.0` | 02/2022 | - Use correctly double hashing [#issue43](https://github.com/Callidon/bloom-filters/issues/43).
- Move all hashing related functions to its specific Hash class in a component of the BaseFilter class. It also allows for overriding the serizalize function for using custom hash functions
- Add [#PR44](https://github.com/Callidon/bloom-filters/pull/44) optimizing the BloomFilter internal storage with Uint arrays.
- Disable 10.x, 15.x node tests.
- Add XorFilter [#29](https://github.com/Callidon/bloom-filters/issues/29)
- Add `.nextInt32()` function to get a new random seeded int 32-bits from the current seed.
- Make all properties public for allowing developpers to override everything. | +| `v2.1.0` | 03/2022 | - Add Scalable Bloom filters
- Use array of BitSet for Partitioned Bloom Filter
- Fix wrong MinHash comparison | +| `v2.0.0` | 02/2022 | - Use correctly double hashing [#issue43](https://github.com/Callidon/bloom-filters/issues/43).
- Move all hashing related functions to its specific Hash class in a component of the BaseFilter class. It also allows for overriding the serialize function for using custom hash functions
- Add [#PR44](https://github.com/Callidon/bloom-filters/pull/44) optimizing the BloomFilter internal storage with Uint arrays.
- Disable 10.x, 15.x node tests.
- Add XorFilter [#29](https://github.com/Callidon/bloom-filters/issues/29)
- Add `.nextInt32()` function to get a new random seeded int 32-bits from the current seed.
- Make all properties public for allowing developers to override everything. | | `v1.3.0` | 10/04/2020 | Added the MinHash set | | `v1.2.0` | 08/04/2020 | Add the TopK class | | `v1.1.0` | 03/04/2020 | Add the HyperLogLog sketch | @@ -624,27 +622,3 @@ When submitting pull requests please follow the following guidance: ## License [MIT License](https://github.com/Callidon/bloom-filters/blob/master/LICENSE) - - -## Next/v4.0.0 todo list - -* Specify this is now an ESM module -* Specify we do not use xxhashjs but @node-rs/xxashs with a webassembly when bundled and a platform specific binary when using node. -* Docs for a browser/bundle usage with rspack and webpack examples -* Docs for node - -Versioning/Publishing: -* version was bumped to 4.0.0-alpha.0 with `yarn version premajor --preid alpha` from the 3.0.1 -* incrementing is: `yarn version prerelease` -* publishing is: `yarn publish --tag alpha` - -Fix tests: -[x] fix utils: seed mandatory, getDistinctIndices was cycling, the fix is to trigger the randomizer periodically; aka after `size` cycles -[ ] fix Cuckoo: -* The seed is now a BigInt and all functions internally works with bigint. -* TODO: JSON.stringify can't work with BigInt so we must encode a bigint as `{'$bf$bigint': this.seed.toString()}` and revive the input with `BigInt(input.value)` See: https://github.com/GoogleChromeLabs/jsbi/issues/30 and MDN recommendation https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/BigInt#use_within_json . The note mentions to be carefull with the reviver, so if we use `$bf$bigint` it should be enough to not collide with a serialized property. Since we have our own format we don't need to patch JSON. We can adapt our own serializer/deserializer. 
-[ ] fix Hyper log log -[ ] fix iblt -[ ] fix min hash -[ ] fix scalable -[ ] fix xor diff --git a/examples/node.cjs b/examples/node.cjs index 02c5d32..3fffc93 100644 --- a/examples/node.cjs +++ b/examples/node.cjs @@ -11,4 +11,13 @@ const items = ['alice', 'bob'] const errorRate = 0.04 // 4 % error rate filter = bfs.BloomFilter.create(items.length, errorRate) filter = bfs.BloomFilter.from(items, errorRate) -bfs.BloomFilter.fromJSON(filter.saveAsJSON()) \ No newline at end of file +bfs.BloomFilter.fromJSON(filter.saveAsJSON()) + +bfs.Hashing.lib = { + xxh64: (x, _) => 1n, + xxh128: (x, _) => 1n, +} +filter._hashing._lib = bfs.Hashing.lib +const hashes = filter._hashing.hashTwice('x') +assert(hashes.first === 1n) +assert(hashes.second === 1n) \ No newline at end of file diff --git a/package.json b/package.json index a663f63..e50b940 100644 --- a/package.json +++ b/package.json @@ -38,9 +38,9 @@ "import": "./dist/module/dist/counting-bloom-filter.js", "require": "./dist/commonjs/dist/counting-bloom-filter.js" }, - "./partitionned-filter.js": { - "import": "./dist/module/dist/partitionned-bloom-filter.js", - "require": "./dist/commonjs/dist/partitionned-bloom-filter.js" + "./partitioned-filter.js": { + "import": "./dist/module/dist/partitioned-bloom-filter.js", + "require": "./dist/commonjs/dist/partitioned-bloom-filter.js" }, "./scalable-filter.js": { "import": "./dist/module/dist/scalable-bloom-filter.js", @@ -123,7 +123,7 @@ "test:module": "node examples/node.mjs", "test:lint-build": "yarn lint && yarn build", "test": "yarn test:lint-build && yarn test:commonjs && yarn test:module && jest", - "doc": "typedoc --sort alphabetical --out docs/ --emit both --includeVersion src/index.ts", + "doc": "typedoc --sort alphabetical --out docs/ --emit both --includeVersion src/**/*.ts", "clean": "rimraf docs/ dist/module/dist dist/commonjs/dist dist/website/rspack dist/website/webpack" }, "repository": { @@ -136,7 +136,7 @@ "bloom filter", "probabilistic", "datastructure", - 
"partitionned bloom filter", + "partitioned bloom filter", "scalable bloom filter", "counting bloom filter", "invertible bloom filter", diff --git a/src/count-min-sketch.ts b/src/count-min-sketch.ts index ffaa2b8..dcb4298 100644 --- a/src/count-min-sketch.ts +++ b/src/count-min-sketch.ts @@ -95,7 +95,7 @@ export default class CountMinSketch extends BaseFilter implements CountingFilter /** * Update the count min sketch with a new occurrence of an element * @param element - The new element - * @param count - Number of occurences of the elemnt (defauls to one) + * @param count - Number of occurrences of the element (defaults to one) */ public update(element: HashableInput, count = 1): void { this._allSums += count @@ -106,9 +106,9 @@ } /** - * Perform a point query: estimate the number of occurence of an element + * Perform a point query: estimate the number of occurrences of an element * @param element - The element we want to count - * @return The estimate number of occurence of the element + * @return The estimated number of occurrences of the element */ public count(element: HashableInput): number { let min = Infinity diff --git a/src/interfaces/counting-filter.ts b/src/interfaces/counting-filter.ts index 3d0a803..8a28cea 100644 --- a/src/interfaces/counting-filter.ts +++ b/src/interfaces/counting-filter.ts @@ -1,5 +1,5 @@ /** - * A filter that can count occurences of items and estimate their frequencies. + * A filter that can count occurrences of items and estimate their frequencies. 
* @author Thomas Minier * @author Arnaud Grall */ @@ -7,14 +7,14 @@ export default interface CountingFilter { /** * Update the count min sketch with a new occurrence of an element * @param element - The new element - * @param count - Number of occurences of the elemnt (defauls to one) + * @param count - Number of occurrences of the elemnt (defauls to one) */ update(element: T, count: number): void /** - * Perform a point query: estimate the number of occurence of an element + * Perform a point query: estimate the number of occurrence of an element * @param element - The element we want to count - * @return The estimate number of occurence of the element + * @return The estimate number of occurrence of the element */ count(element: T): number } diff --git a/src/scalable-bloom-filter.ts b/src/scalable-bloom-filter.ts index fd08de4..82bb3a6 100644 --- a/src/scalable-bloom-filter.ts +++ b/src/scalable-bloom-filter.ts @@ -176,7 +176,7 @@ export default class ScalableBloomFilter } /** - * Create a Scalable Bloom Filter based on Partitionned Bloom Filter. + * Create a Scalable Bloom Filter based on Partitioned Bloom Filter. * @param _size the starting size of the filter * @param _error_rate ther error rate desired of the filter * @param _ratio the tightening ration diff --git a/tests/hyperloglog.test.ts b/tests/hyperloglog.test.ts index 7ae1637..421794d 100644 --- a/tests/hyperloglog.test.ts +++ b/tests/hyperloglog.test.ts @@ -36,7 +36,7 @@ test('should support update and cardinality estimations (count) operations', () ) } }) -test('should peforms the union of two HyperLogLog sketches', () => { +test('should performs the union of two HyperLogLog sketches', () => { const first = new HyperLogLog(2 ** 4) const second = new HyperLogLog(2 ** 4) first.update('alice') @@ -115,10 +115,10 @@ test('issue#(https://github.com/Callidon/bloom-filters/issues/69)', () => { const sketch = new HyperLogLog(128) // push 10000 distinct elements const n = 2 ** 14 - for (let i = 0; i