Callidon · folkvir · Dec 17, 2021 · Nov 24, 2021 · Nov 25, 2021 · Dec 2, 2021
diff --git a/package.json b/package.json
@@ -47,6 +47,7 @@
     "typescript": "^3.7.5"
   },
   "dependencies": {
+    "base64-arraybuffer": "^1.0.1",
     "is-buffer": "^2.0.4",
     "lodash": "^4.17.15",
     "lodash.eq": "^4.0.0",

diff --git a/src/api.ts b/src/api.ts
@@ -27,6 +27,7 @@ SOFTWARE.
 export { default as BloomFilter } from './bloom/bloom-filter'
 export { default as CountingBloomFilter } from './bloom/counting-bloom-filter'
 export { default as PartitionedBloomFilter } from './bloom/partitioned-bloom-filter'
+export { default as BitSet } from './bloom/bit-set'
 export { default as CountMinSketch } from './sketch/count-min-sketch'
 export { default as HyperLogLog } from './sketch/hyperloglog'
 export { default as TopK } from './sketch/topk'

diff --git a/src/bloom/bit-set.ts b/src/bloom/bit-set.ts
@@ -0,0 +1,168 @@
+/* file : bit-set.ts
+MIT License
+
+Copyright (c) 2021 David Leppik
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+import {encode, decode} from "base64-arraybuffer";
+
+const bitsPerWord = 8;
+
+/**
+ * A memory-efficient Boolean array. Contains just the minimal operations needed for our Bloom filter implementation.
+ *
+ * @author David Leppik
+ */
+export default class BitSet {
+    readonly size: number;
+
+    // Uint32Array may be slightly faster due to memory alignment, but Uint8Array avoids endianness issues when serializing
+    private array: Uint8Array;
+
+    /**
+     * Constructor. All bits are initially set to false.
+     * @param size the number of bits that can be stored. (This is NOT required to be a multiple of 8.)
+     */
+    constructor(size: number) {
+        this.size = size
+        this.array = new Uint8Array(Math.ceil(size / bitsPerWord))
+    }
+
+    /**
+     * Returns the value of the bit at the given index
+     * @param index position of the bit, zero-indexed
+     */
+    has(index: number): boolean {
+        const wordIndex = Math.floor(index / bitsPerWord)
+        const mask = 1 << (index % bitsPerWord)
+        return (this.array[wordIndex] & mask) !== 0
+    }
+
+    /**
+     * Set the bit to true
+     * @param index position of the bit, zero-indexed
+     */
+    add(index: number) {
+        const wordIndex = Math.floor(index / bitsPerWord) // which byte of the backing array holds the bit
+        const mask = 1 << (index % bitsPerWord) // single-bit mask for the position inside that byte
+        this.array[wordIndex] |= mask // OR keeps the other bits of the byte; setting a bit twice is a no-op
+    }
+
+    /**
+     * Returns the index of the highest bit set to true, or 0 if no bit is set (the same result as when only bit 0 is set).
+     */
+    max(): number {
+        for (let i = this.array.length - 1; i >= 0; i--) {
+            let bits = this.array[i];
+            if (bits) {
+                return BitSet.highBit(bits) + (i*bitsPerWord);
+            }
+        }
+        return 0;
+    }
+
+    /**
+     * Returns the number of true bits.
+     */
+    bitCount(): number {
+        let result = 0
+        for (let i = 0; i < this.array.length; i++) {
+            result += BitSet.countBits(this.array[i]) // Assumes we never have bits set beyond the end
+        }
+        return result
+    }
+
+    /**
+     * Returns true if the size and contents are identical.
+     * @param other another BitSet
+     */
+    equals(other: BitSet): boolean {
+        if (other.size !== this.size) {
+            return false
+        }
+        for (let i = 0; i < this.array.length; i++) {
+            if (this.array[i] !== other.array[i]) {
+                return false
+            }
+        }
+        return true
+    }
+
+    /** 
+     * Returns a JSON-encodable object readable by {@link import}. 
+     */
+    export(): { size: number, content: string } {
+        return {
+            size: this.size,
+            content: encode(this.array)
+        }
+    }
+
+    /**
+     * Creates a BitSet from an object written by {@link export}.
+     * @param data an object written by {@link export}
+     */
+    static import(data: any): BitSet {
+        if (typeof data.size !== "number") {
+            throw Error("BitSet missing size")
+        }
+        if (typeof data.content !== "string") {
+            throw Error("BitSet: missing content")
+        }
+        const result = new BitSet(data.size)
+        const buffer = decode(data.content)
+        result.array = new Uint8Array(buffer)
+        return result
+    }
+
+    /**
+     * Returns the index of the maximum bit in the number, or -1 for 0
+     * @param bits an unsigned 8-bit number
+     * @example
+     * BitSet.highBit(0) // returns -1
+     * BitSet.highBit(5) // returns 2
+     */
+    private static highBit(bits: number): number {
+        let result = bitsPerWord - 1;
+        let mask = 1 << result;
+        while (result >= 0 && ((mask & bits) !== mask)) {
+            mask >>>= 1;
+            result--;
+        }
+        return result;
+    }
+
+    /**
+     * Returns the number of true bits in the number
+     * @param bits an unsigned 8-bit number
+     * @example
+     * BitSet.countBits(0) // returns 0
+     * BitSet.countBits(3) // returns 2
+     */
+    private static countBits(bits: number): number {
+        let result = bits & 1;
+        while (bits !== 0) {
+            bits = bits >>> 1;
+            result += (bits & 1)
+        }
+        return result
+    }
+}
diff --git a/src/bloom/bloom-filter.ts b/src/bloom/bloom-filter.ts
@@ -26,9 +26,10 @@ SOFTWARE.
 
 import ClassicFilter from '../interfaces/classic-filter'
 import BaseFilter from '../base-filter'
+import BitSet from "./bit-set";
 import { AutoExportable, Field, Parameter } from '../exportable'
 import { optimalFilterSize, optimalHashes } from '../formulas'
-import { HashableInput, allocateArray, getDistinctIndices } from '../utils'
+import { HashableInput, getDistinctIndices } from '../utils'
 
 /**
  * A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970,
@@ -42,16 +43,13 @@ import { HashableInput, allocateArray, getDistinctIndices } from '../utils'
 @AutoExportable<BloomFilter>('BloomFilter', ['_seed'])
 export default class BloomFilter extends BaseFilter implements ClassicFilter<HashableInput> {
   @Field()
-  private _size: number
+  private readonly _size: number
 
   @Field()
-  private _nbHashes: number
+  private readonly _nbHashes: number
 
-  @Field()
-  private _filter: Array<number>
-
-  @Field()
-  private _length: number
+  @Field<BitSet>(f => f.export(), d => BitSet.import(d))
+  private readonly _filter: BitSet
 
   /**
    * Constructor
@@ -65,13 +63,12 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
     }
     this._size = size
     this._nbHashes = nbHashes
-    this._filter = allocateArray(this._size, 0)
-    this._length = 0
+    this._filter = new BitSet(size)
   }
 
   /**
    * Create an optimal bloom filter providing the maximum of elements stored and the error rate desired
-   * @param  items      - The maximum nuber of item to store
+   * @param  nbItems    - The maximum number of items to store
    * @param  errorRate  - The error rate desired for a maximum of items inserted
    * @return A new {@link BloomFilter}
    */
@@ -84,15 +81,19 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
   /**
    * Build a new Bloom Filter from an existing iterable with a fixed error rate
    * @param items - The iterable used to populate the filter
-   * @param errorRate - The error rate, i.e. 'false positive' rate, targetted by the filter
+   * @param errorRate - The error rate, i.e. 'false positive' rate, targeted by the filter
+   * @param seed - The random number seed (optional)
    * @return A new Bloom Filter filled with the iterable's elements
    * @example
    * // create a filter with a false positive rate of 0.1
    * const filter = BloomFilter.from(['alice', 'bob', 'carl'], 0.1);
    */
-  static from (items: Iterable<HashableInput>, errorRate: number): BloomFilter {
+  static from (items: Iterable<HashableInput>, errorRate: number, seed?: number): BloomFilter {
     const array = Array.from(items)
     const filter = BloomFilter.create(array.length, errorRate)
+    if (typeof seed === 'number') {
+      filter.seed = seed
+    }
     array.forEach(element => filter.add(element))
     return filter
   }
@@ -110,7 +111,7 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
    * @return The filter length
    */
   get length (): number {
-    return this._length
+    return this._filter.bitCount()
   }
 
   /**
@@ -123,10 +124,7 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
   add (element: HashableInput): void {
     const indexes = getDistinctIndices(element, this._size, this._nbHashes, this.seed)
     for (let i = 0; i < indexes.length; i++) {
-      if (!this._filter[indexes[i]]) {
-        this._length++
-      }
-      this._filter[indexes[i]] = 1
+      this._filter.add(indexes[i]);
     }
   }
 
@@ -143,7 +141,7 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
   has (element: HashableInput): boolean {
     const indexes = getDistinctIndices(element, this._size, this._nbHashes, this.seed)
     for (let i = 0; i < indexes.length; i++) {
-      if (!this._filter[indexes[i]]) {
+      if (!this._filter.has(indexes[i])) {
         return false
       }
     }
@@ -158,18 +156,18 @@ export default class BloomFilter extends BaseFilter implements ClassicFilter<Has
    * console.log(filter.rate()); // output: something around 0.1
    */
   rate (): number {
-    return Math.pow(1 - Math.exp(-this._length / this._size), this._nbHashes)
+    return Math.pow(1 - Math.exp(-this.length / this._size), this._nbHashes)
   }
 
   /**
    * Check if another Bloom Filter is equal to this one
-   * @param  filter - The filter to compare to this one
+   * @param  other - The filter to compare to this one
    * @return True if they are equal, false otherwise
    */
   equals (other: BloomFilter): boolean {
-    if (this._size !== other._size || this._nbHashes !== other._nbHashes || this._length !== other._length) {
+    if (this._size !== other._size || this._nbHashes !== other._nbHashes) {
       return false
     }
-    return this._filter.every((value, index) => other._filter[index] === value)
+    return this._filter.equals(other._filter)
   }
 }