diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Bitmap.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/Bitmap.java deleted file mode 100644 index 63fac48dbd..0000000000 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Bitmap.java +++ /dev/null @@ -1,181 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -import com.google.common.annotations.VisibleForTesting; -import io.airlift.slice.SizeOf; -import io.airlift.slice.SliceInput; -import org.openjdk.jol.info.ClassLayout; - -import static com.google.common.base.Preconditions.checkArgument; - -/** - * A level of abstraction over the bitmaps used in sketches such as LPCA and SFM. - * These are essentially arrays of booleans that support flipping and applying randomized response. - * Concretely, these are stored as byte arrays. - */ -public class Bitmap -{ - private static final int INSTANCE_SIZE = ClassLayout.parseClass(Bitmap.class).instanceSize(); - - private final byte[] bitmap; - - public Bitmap(int length) - { - validateLength(length); - bitmap = new byte[length / Byte.SIZE]; - } - - private Bitmap(byte[] bytes) - { - bitmap = bytes; - } - - public static Bitmap fromBytes(byte[] bytes) - { - return new Bitmap(bytes); - } - - public static Bitmap fromSliceInput(SliceInput input, int length) - { - validateLength(length); - byte[] bytes = new byte[length / Byte.SIZE]; - for (int i = 0; i < bytes.length; i++) { - bytes[i] = input.readByte(); - } - return Bitmap.fromBytes(bytes); - } - - public byte[] toBytes() - { - return bitmap; - } - - @VisibleForTesting - static int bitmapBitShift(int position) - { - return position % Byte.SIZE; - } - - @VisibleForTesting - static int bitmapByteIndex(int position) - { - // n.b.: position is 0-indexed - return Math.floorDiv(position, Byte.SIZE); - } - - public int byteLength() - { - return bitmap.length; - } - - @Override - public Bitmap clone() - { - return Bitmap.fromBytes(bitmap.clone()); - } - - public long getRetainedSizeInBytes() - { - return INSTANCE_SIZE + SizeOf.sizeOf(bitmap); - } - - public boolean getBit(int position) - { - int b = bitmapByteIndex(position); - int shift = bitmapBitShift(position); - - return ((bitmap[b] >> shift) & 1) == 1; - } - - /** - * The number of 1-bits in the bitmap - */ - public int getBitCount() - { - int count = 0; - for (byte b : bitmap) { - count += Integer.bitCount(Byte.toUnsignedInt(b)); - } - return count; - } - - /** - * Randomly (and independently) flip all bits with specified probability - */ - public void flipAll(double probability, RandomizationStrategy randomizationStrategy) - { - for (int i = 0; i < bitmap.length * Byte.SIZE; i++) { - flipBit(i, probability, randomizationStrategy); - } - } - - /** - * Deterministically flips the bit at a given position - */ - public void flipBit(int position) - { - byte oneBit = (byte) (1 << bitmapBitShift(position)); - bitmap[bitmapByteIndex(position)] ^= oneBit; - } - - /** - * Randomly flips the bit at a given position with specified probability - */ - public void flipBit(int position, double probability, RandomizationStrategy randomizationStrategy) - { - if (randomizationStrategy.nextBoolean(probability)) { - flipBit(position); - } - } - - public int length() - { - return bitmap.length * Byte.SIZE; - } - - /** - * Explicitly set the value of the bit at a given position - */ - public void setBit(int position, boolean value) - { - byte oneBit = (byte) (1 << bitmapBitShift(position)); - if (value) { - bitmap[bitmapByteIndex(position)] |= oneBit; - } - else { - bitmap[bitmapByteIndex(position)] &= ~oneBit; - } - } - - public Bitmap or(Bitmap other) - { - byte[] bytes = toBytes().clone(); - byte[] bytesOther = other.toBytes(); - - checkArgument(bytes.length == bytesOther.length, "cannot OR two bitmaps of different size"); - - for (int i = 0; i < bytes.length; i++) { - bytes[i] |= bytesOther[i]; - } - - return Bitmap.fromBytes(bytes); - } - - public Bitmap xor(Bitmap other) - { - byte[] bytes = toBytes().clone(); - byte[] bytesOther = other.toBytes(); - - checkArgument(bytes.length == bytesOther.length, "cannot XOR two bitmaps of different size"); - - for (int i = 0; i < bytes.length; i++) { - bytes[i] ^= bytesOther[i]; - } - - return Bitmap.fromBytes(bytes); - } - - private static void validateLength(int length) - { - checkArgument(length > 0 && length % Byte.SIZE == 0, "bitmap size must be a positive multiple of %s", Byte.SIZE); - } -} diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Format.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/Format.java index d6c06745d5..5745842cdf 100644 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Format.java +++ b/stats/src/main/java/com/facebook/airlift/stats/cardinality/Format.java @@ -19,8 +19,7 @@ enum Format DENSE_V1(1), SPARSE_V2(2), DENSE_V2(3), - PRIVATE_LPCA_V1(4), - SFM_V1(7); + PRIVATE_LPCA_V1(4); private byte tag; diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/PrivateLpcaSketch.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/PrivateLpcaSketch.java index 6c2fa040c7..0faf08200d 100644 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/PrivateLpcaSketch.java +++ b/stats/src/main/java/com/facebook/airlift/stats/cardinality/PrivateLpcaSketch.java @@ -39,7 +39,7 @@ @NotThreadSafe public class PrivateLpcaSketch { - private Bitmap bitmap; + private byte[] bitmap; private final int threshold; private final int numberOfBuckets; private final double epsilonThreshold; @@ -82,7 +82,10 @@ public PrivateLpcaSketch(Slice serialized, RandomizationStrategy randomizationSt threshold = input.readInt(); epsilonThreshold = input.readDouble(); epsilonRandomizedResponse = input.readDouble(); - bitmap = Bitmap.fromSliceInput(input, numberOfBuckets); + bitmap = new byte[numberOfBuckets / Byte.SIZE]; + for (int i = 0; i < bitmap.length; i++) { + bitmap[i] = input.readByte(); + } } private void applyRandomizedResponse() @@ -90,7 +93,7 @@ private void applyRandomizedResponse() double p = getFlipProbability(); for (int i = 0; i < numberOfBuckets; i++) { if (randomizationStrategy.nextBoolean(p)) { - bitmap.flipBit(i); + flipBit(i); } } } @@ -99,10 +102,23 @@ private void applyRandomizedResponse(int bucket) { double p = getFlipProbability(); if (randomizationStrategy.nextBoolean(p)) { - bitmap.flipBit(bucket); + flipBit(bucket); } } + @VisibleForTesting + static int bitmapBitShift(int bucket) + { + return bucket % Byte.SIZE; + } + + @VisibleForTesting + static int bitmapByteIndex(int bucket) + { + // n.b.: bucket is 0-indexed + return Math.floorDiv(bucket, Byte.SIZE); + } + public long cardinality() { double proportion = getDebiasedBitProportion(); @@ -139,7 +155,14 @@ private int findThreshold(HyperLogLog hll) } @VisibleForTesting - Bitmap getBitmap() + void flipBit(int bucket) + { + byte oneBit = (byte) (1 << bitmapBitShift(bucket)); + bitmap[bitmapByteIndex(bucket)] ^= oneBit; + } + + @VisibleForTesting + byte[] getBitmap() { return bitmap; } @@ -152,8 +175,8 @@ private double getDebiasedBitProportion() // So the proportion of bits equal to 1 has expectation: // p + (1-2p) T, // where T is the true proportion. - double probability = getFlipProbability(); - return (getRawBitProportion() - probability) / (1 - 2 * probability); + double effProbability = randomizationStrategy.effectiveProbability(getFlipProbability()); + return (getRawBitProportion() - effProbability) / (1 - 2 * effProbability); } private double getFlipProbability() @@ -169,7 +192,11 @@ public int getNumberOfBuckets() @VisibleForTesting double getRawBitProportion() { - return (double) bitmap.getBitCount() / numberOfBuckets; + double count = 0; + for (byte b : bitmap) { + count += Integer.bitCount(b & BYTE_MASK); + } + return count / numberOfBuckets; } public int getThreshold() @@ -187,11 +214,23 @@ public Slice serialize() .appendInt(threshold) .appendDouble(epsilonThreshold) .appendDouble(epsilonRandomizedResponse) - .appendBytes(bitmap.toBytes()); + .appendBytes(bitmap); return output.slice(); } + @VisibleForTesting + void setBit(int bucket, boolean value) + { + byte oneBit = (byte) (1 << bitmapBitShift(bucket)); + if (value) { + bitmap[bitmapByteIndex(bucket)] |= oneBit; + } + else { + bitmap[bitmapByteIndex(bucket)] &= ~oneBit; + } + } + /** * Updates current sketch by adding data from a second HyperLogLog * @@ -206,7 +245,7 @@ public void update(HyperLogLog hllOther) // if the new HLL's bucket value is at or below threshold, we don't need to do anything // if above threshold, we need to set to 1 and then re-apply randomized response on the bit if (value > threshold) { - bitmap.setBit(i, true); + setBit(i, true); applyRandomizedResponse(i); } }); @@ -214,7 +253,7 @@ public void update(HyperLogLog hllOther) private void writeBitmap(HyperLogLog hll) { - bitmap = new Bitmap(numberOfBuckets); - hll.eachBucket((i, value) -> bitmap.setBit(i, value > threshold)); + bitmap = new byte[numberOfBuckets / Byte.SIZE]; + hll.eachBucket((i, value) -> setBit(i, value > threshold)); } } diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/RandomizationStrategy.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/RandomizationStrategy.java index 2a5a071bdc..5de009c27a 100644 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/RandomizationStrategy.java +++ b/stats/src/main/java/com/facebook/airlift/stats/cardinality/RandomizationStrategy.java @@ -13,21 +13,11 @@ */ package com.facebook.airlift.stats.cardinality; -public abstract class RandomizationStrategy +public interface RandomizationStrategy { - abstract long getRetainedSizeInBytes(); + double effectiveProbability(double probability); - public boolean nextBoolean(double probability) - { - return nextDouble() <= probability; - } + boolean nextBoolean(double probability); - abstract double nextDouble(); - - public double nextLaplace(double scale) - { - double quantile = nextDouble(); - int z = nextDouble() <= 0.5 ? 1 : 0; - return (2 * z - 1) * scale * Math.log(quantile); - } + double nextLaplace(double scale); } diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/SecureRandomizationStrategy.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/SecureRandomizationStrategy.java index 135ba48638..5f8832221f 100644 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/SecureRandomizationStrategy.java +++ b/stats/src/main/java/com/facebook/airlift/stats/cardinality/SecureRandomizationStrategy.java @@ -13,31 +13,36 @@ */ package com.facebook.airlift.stats.cardinality; -import org.openjdk.jol.info.ClassLayout; - import java.security.SecureRandom; -/** - * Note: Due to finite-precision implementation details, usage of floating-point functions - * as random noise, while cryptographically secure, may leak information from a privacy context. - * See "On Significance of the Least Significant Bits For Differential Privacy" by Mironov - * and use judiciously. - */ public class SecureRandomizationStrategy - extends RandomizationStrategy + implements RandomizationStrategy { - private static final int INSTANCE_SIZE = ClassLayout.parseClass(SecureRandomizationStrategy.class).instanceSize(); - private static final SecureRandom random = new SecureRandom(); + private final SecureRandom random; - public SecureRandomizationStrategy() {} + public SecureRandomizationStrategy() + { + this.random = new SecureRandom(); + } + + public double effectiveProbability(double probability) + { + return probability; + } - public long getRetainedSizeInBytes() + public boolean nextBoolean(double probability) { - return INSTANCE_SIZE; + return random.nextDouble() <= probability; } - public double nextDouble() + public double nextLaplace(double scale) { - return random.nextDouble(); + // Note: Due to finite-precision implementation details, usage of this as random noise, + // while cryptographically secure, may leak information from a privacy context. + // See "On Significance of the Least Significant Bits For Differential Privacy" by Mironov + // and use judiciously. + double quantile = random.nextDouble(); + int z = random.nextBoolean() ? 1 : 0; + return (2 * z - 1) * scale * Math.log(quantile); } } diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/SfmSketch.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/SfmSketch.java deleted file mode 100644 index e921cb736f..0000000000 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/SfmSketch.java +++ /dev/null @@ -1,386 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -import com.google.common.annotations.VisibleForTesting; -import io.airlift.slice.BasicSliceInput; -import io.airlift.slice.DynamicSliceOutput; -import io.airlift.slice.Murmur3Hash128; -import io.airlift.slice.SizeOf; -import io.airlift.slice.Slice; -import org.openjdk.jol.info.ClassLayout; - -import javax.annotation.concurrent.NotThreadSafe; - -import static com.facebook.airlift.stats.cardinality.Utils.computeIndex; -import static com.facebook.airlift.stats.cardinality.Utils.indexBitLength; -import static com.facebook.airlift.stats.cardinality.Utils.numberOfBuckets; -import static com.facebook.airlift.stats.cardinality.Utils.numberOfTrailingZeros; -import static com.google.common.base.Preconditions.checkArgument; - -/** - * SfmSketch is a sketch for distinct counting, very similar to HyperLogLog. - * This sketch is introduced as the Sketch-Flip-Merge (SFM) summary in the paper - * Sketch-Flip-Merge: Mergeable Sketches for Private Distinct Counting. - *
- * The primary differences between SfmSketch and HyperLogLog are that - * (a) SfmSketch supports differential privacy, and - * (b) where HyperLogLog tracks only max observed bucket values, SfmSketch tracks all bucket values observed. - *
- * This means that SfmSketch is a larger sketch than HyperLogLog, but offers the ability to store completely - * DP sketches with a fixed, public hash function while maintaining accurate cardinality estimates. - *
- * SfmSketch is created in a non-private mode. Privacy must be enabled through the enablePrivacy() function. - * Once made private, the sketch becomes immutable. Privacy is quantified by the parameter epsilon. - *
- * When epsilon is greater than 0, the sketch is epsilon-DP, and bits are randomized to preserve privacy. - * When epsilon == NON_PRIVATE_EPSILON, the sketch is not private, and bits are set deterministically. - *
- * The best accuracy comes with NON_PRIVATE_EPSILON. For private epsilons, larger gives more accuracy, - * while smaller gives more privacy. - */ -@NotThreadSafe -public class SfmSketch -{ - public static final double NON_PRIVATE_EPSILON = Double.POSITIVE_INFINITY; - - private static final int MAX_ESTIMATION_ITERATIONS = 1000; - private static final int INSTANCE_SIZE = ClassLayout.parseClass(SfmSketch.class).instanceSize(); - - private final int indexBitLength; - private final int precision; - private final RandomizationStrategy randomizationStrategy; - - private double randomizedResponseProbability; - private Bitmap bitmap; - - private SfmSketch(Bitmap bitmap, int indexBitLength, int precision, double randomizedResponseProbability, RandomizationStrategy randomizationStrategy) - { - validatePrefixLength(indexBitLength); - validatePrecision(precision, indexBitLength); - validateRandomizedResponseProbability(randomizedResponseProbability); - - this.bitmap = bitmap; - this.indexBitLength = indexBitLength; - this.precision = precision; - this.randomizedResponseProbability = randomizedResponseProbability; - this.randomizationStrategy = randomizationStrategy; - } - - /** - * Create a new SfmSketch in non-private mode. To make private, - * call enablePrivacy() after populating the sketch. - */ - public static SfmSketch create(int numberOfBuckets, int precision) - { - return create(numberOfBuckets, precision, new SecureRandomizationStrategy()); - } - - /** - * Create a new SfmSketch in non-private mode. To make private, - * call enablePrivacy() after populating the sketch. - */ - public static SfmSketch create(int numberOfBuckets, int precision, RandomizationStrategy randomizationStrategy) - { - // Only create non-private sketches. - // Private sketches are immutable, so they're kind of useless to create. - double randomizedResponseProbability = getRandomizedResponseProbability(NON_PRIVATE_EPSILON); - int indexBitLength = indexBitLength(numberOfBuckets); - Bitmap bitmap = new Bitmap(numberOfBuckets * precision); - return new SfmSketch(bitmap, indexBitLength, precision, randomizedResponseProbability, randomizationStrategy); - } - - public static SfmSketch deserialize(Slice serialized) - { - return deserialize(serialized, new SecureRandomizationStrategy()); - } - - public static SfmSketch deserialize(Slice serialized, RandomizationStrategy randomizationStrategy) - { - // Format: - // format | indexBitLength | precision | epsilon | bitmap - BasicSliceInput input = serialized.getInput(); - byte format = input.readByte(); - checkArgument(format == Format.SFM_V1.getTag(), "Wrong format tag"); - - int indexBitLength = input.readInt(); - int precision = input.readInt(); - double randomizedResponseProbability = input.readDouble(); - - Bitmap bitmap = Bitmap.fromSliceInput(input, numberOfBuckets(indexBitLength) * precision); - return new SfmSketch(bitmap, indexBitLength, precision, randomizedResponseProbability, randomizationStrategy); - } - - public void add(long value) - { - addHash(Murmur3Hash128.hash64(value)); - } - - public void add(Slice value) - { - addHash(Murmur3Hash128.hash64(value)); - } - - public void addHash(long hash) - { - int index = computeIndex(hash, indexBitLength); - // cap zeros at precision - 1 - // essentially, we're looking at a (precision - 1)-bit hash - int zeros = Math.min(precision - 1, numberOfTrailingZeros(hash, indexBitLength)); - flipBitOn(index, zeros); - } - - /** - * Estimates cardinality via maximum psuedolikelihood (Newton's method) - */ - public long cardinality() - { - // The initial guess of 1 may seem awful, but this converges quickly, and starting small returns better results for small cardinalities. - // This generally takes <= 40 iterations, even for cardinalities as large as 10^33. - double guess = 1; - double changeInGuess = Double.POSITIVE_INFINITY; - int iterations = 0; - while (Math.abs(changeInGuess) > 0.1 && iterations < MAX_ESTIMATION_ITERATIONS) { - changeInGuess = -logLikelihoodFirstDerivative(guess) / logLikelihoodSecondDerivative(guess); - guess += changeInGuess; - iterations += 1; - } - return Math.max(0, Math.round(guess)); - } - - /** - * Enable privacy on a non-privacy-enabled sketch - *
- * Per Lemma 4.7, arXiv:2302.02056, - * flipping every bit with probability 1/(e^epsilon + 1) achieves differential privacy. - */ - public void enablePrivacy(double epsilon) - { - checkArgument(!isPrivacyEnabled(), "sketch is already privacy-enabled"); - validateEpsilon(epsilon); - - randomizedResponseProbability = getRandomizedResponseProbability(epsilon); - - // Flip every bit with fixed probability - for (int i = 0; i < bitmap.length(); i++) { - bitmap.flipBit(i, randomizedResponseProbability, randomizationStrategy); - } - } - - public int estimatedSerializedSize() - { - return SizeOf.SIZE_OF_BYTE + // type + version - SizeOf.SIZE_OF_INT + // indexBitLength - SizeOf.SIZE_OF_INT + // precision - SizeOf.SIZE_OF_DOUBLE + // randomized response probability - (bitmap.byteLength() * SizeOf.SIZE_OF_BYTE); // bitmap - } - - private void flipBitOn(int bucket, int level) - { - checkArgument(!isPrivacyEnabled(), "privacy-enabled SfmSketch is immutable"); - - int i = getBitLocation(bucket, level); - bitmap.setBit(i, true); - } - - @VisibleForTesting - int getBitLocation(int bucket, int level) - { - return level * numberOfBuckets(indexBitLength) + bucket; - } - - public Bitmap getBitmap() - { - return bitmap; - } - - @VisibleForTesting - double getOnProbability() - { - // probability of a 1-bit remaining a 1-bit under randomized response - return 1 - randomizedResponseProbability; - } - - static double getRandomizedResponseProbability(double epsilon) - { - // If non-private, we don't use randomized response. - // Otherwise, flip bits with probability 1/(exp(epsilon) + 1). - if (epsilon == NON_PRIVATE_EPSILON) { - return 0; - } - return 1.0 / (Math.exp(epsilon) + 1); - } - - @VisibleForTesting - double getRandomizedResponseProbability() - { - // probability of a 0-bit flipping to a 1-bit under randomized response - return randomizedResponseProbability; - } - - public long getRetainedSizeInBytes() - { - return INSTANCE_SIZE + bitmap.getRetainedSizeInBytes() + randomizationStrategy.getRetainedSizeInBytes(); - } - - public boolean isPrivacyEnabled() - { - return getRandomizedResponseProbability() > 0; - } - - private double logLikelihoodFirstDerivative(double n) - { - // Technically, this is the first derivative of the log of a psuedolikelihood. - double result = 0; - for (int level = 0; level < precision; level++) { - double termOn = logLikelihoodTermFirstDerivative(level, true, n); - double termOff = logLikelihoodTermFirstDerivative(level, false, n); - for (int bucket = 0; bucket < numberOfBuckets(indexBitLength); bucket++) { - result += bitmap.getBit(getBitLocation(bucket, level)) ? termOn : termOff; - } - } - return result; - } - - private double logLikelihoodTermFirstDerivative(int level, boolean on, double n) - { - double p = observationProbability(level); - int sign = on ? -1 : 1; - double c1 = on ? getOnProbability() : 1 - getOnProbability(); - double c2 = getOnProbability() - getRandomizedResponseProbability(); - return Math.log1p(-p) * (1 - c1 / (c1 + sign * c2 * Math.pow(1 - p, n))); - } - - private double logLikelihoodSecondDerivative(double n) - { - // Technically, this is the second derivative of the log of a psuedolikelihood. - double result = 0; - for (int level = 0; level < precision; level++) { - double termOn = logLikelihoodTermSecondDerivative(level, true, n); - double termOff = logLikelihoodTermSecondDerivative(level, false, n); - for (int bucket = 0; bucket < numberOfBuckets(indexBitLength); bucket++) { - result += bitmap.getBit(getBitLocation(bucket, level)) ? termOn : termOff; - } - } - return result; - } - - private double logLikelihoodTermSecondDerivative(int level, boolean on, double n) - { - double p = observationProbability(level); - int sign = on ? -1 : 1; - double c1 = on ? getOnProbability() : 1 - getOnProbability(); - double c2 = getOnProbability() - getRandomizedResponseProbability(); - return sign * c1 * c2 * Math.pow(Math.log1p(-p), 2) * Math.pow(1 - p, n) * Math.pow(c1 + sign * c2 * Math.pow(1 - p, n), -2); - } - - /** - * Merging two sketches with randomizedResponseProbability values p1 and p2 is equivalent to - * having created two non-private sketches, merged them, then enabled privacy with a - * randomizedResponseProbability value of: - *
- * (p1 + p2 - 3 * p1 * p2) / (1 - 2 * p1 * p2) - *
- * This can be derived from the fact that two private sketches created with epsilon1 and epsilon2 - * merge to be equivalent to a single sketch created with epsilon: - *
- * -log(exp(-epsilon1) + exp(-epsilon2) - exp(-(epsilon1 + epsilon2)) - *
- * For details, see Theorem 4.8, arXiv:2302.02056. - * For verification, see the unit tests. - */ - @VisibleForTesting - static double mergeRandomizedResponseProbabilities(double p1, double p2) - { - return (p1 + p2 - 3 * p1 * p2) / (1 - 2 * p1 * p2); - } - - /** - * Performs a merge of the other sketch into the current sketch. This is performed - * as a randomized merge as described in Theorem 4.8, - * arXiv:2302.02056. - *
- * The formula used in this function is a simplification of the form presented in the original paper. - * See also Section 3, arXiv:2306.09394. - */ - public void mergeWith(SfmSketch other) - { - // Strictly speaking, we may be able to provide more general merging than suggested here. - // It's not clear how useful this would be in practice. - checkArgument(precision == other.precision, "cannot merge two SFM sketches with different precision: %s vs. %s", precision, other.precision); - checkArgument(indexBitLength == other.indexBitLength, "cannot merge two SFM sketches with different indexBitLength: %s vs. %s", - indexBitLength, other.indexBitLength); - - if (!isPrivacyEnabled() && !other.isPrivacyEnabled()) { - // if neither sketch is private, we just take the OR of the sketches - setBitmap(bitmap.or(other.getBitmap())); - } - else { - // if either sketch is private, we combine using a randomized merge - // (the non-private case above is a special case of this more complicated math) - double p1 = randomizedResponseProbability; - double p2 = other.randomizedResponseProbability; - double p = mergeRandomizedResponseProbabilities(p1, p2); - double normalizer = (1 - 2 * p) / ((1 - 2 * p1) * (1 - 2 * p2)); - - for (int i = 0; i < bitmap.length(); i++) { - double bit1 = bitmap.getBit(i) ? 1 : 0; - double bit2 = other.bitmap.getBit(i) ? 1 : 0; - double x = 1 - 2 * p - normalizer * (1 - p1 - bit1) * (1 - p2 - bit2); - double probability = p + normalizer * x; - probability = Math.min(1.0, Math.max(0.0, probability)); - bitmap.setBit(i, randomizationStrategy.nextBoolean(probability)); - } - } - - randomizedResponseProbability = mergeRandomizedResponseProbabilities(randomizedResponseProbability, other.randomizedResponseProbability); - } - - private double observationProbability(int level) - { - // probability of observing a run of zeros of length level in any single bucket - // note: this is NOT (in general) the probability of having a 1 in the corresponding location in the sketch - // (it is if bits are set deterministically, as when epsilon < 0) - return Math.pow(2.0, -(level + 1)) / numberOfBuckets(indexBitLength); - } - - public Slice serialize() - { - int size = estimatedSerializedSize(); - - DynamicSliceOutput output = new DynamicSliceOutput(size) - .appendByte(Format.SFM_V1.getTag()) - .appendInt(indexBitLength) - .appendInt(precision) - .appendDouble(randomizedResponseProbability) - .appendBytes(bitmap.toBytes()); - - return output.slice(); - } - - @VisibleForTesting - void setBitmap(Bitmap bitmap) - { - this.bitmap = bitmap; - } - - private static void validateEpsilon(double epsilon) - { - checkArgument(epsilon > 0, "epsilon must be greater than zero or equal to NON_PRIVATE_EPSILON"); - } - - private static void validatePrecision(int precision, int indexBitLength) - { - checkArgument(precision > 0 && precision % Byte.SIZE == 0, "precision must be a positive multiple of %s", Byte.SIZE); - checkArgument(precision + indexBitLength <= Long.SIZE, "precision + indexBitLength cannot exceed %s", Long.SIZE); - } - - private static void validatePrefixLength(int indexBitLength) - { - checkArgument(indexBitLength >= 1 && indexBitLength <= 32, "indexBitLength is out of range"); - } - - private static void validateRandomizedResponseProbability(double p) - { - checkArgument(p >= 0 && p <= 0.5, "randomizedResponseProbability should be in the interval [0, 0.5]"); - } -} diff --git a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Utils.java b/stats/src/main/java/com/facebook/airlift/stats/cardinality/Utils.java index caba728fa6..4d5ee9abbb 100644 --- a/stats/src/main/java/com/facebook/airlift/stats/cardinality/Utils.java +++ b/stats/src/main/java/com/facebook/airlift/stats/cardinality/Utils.java @@ -64,12 +64,6 @@ public static int numberOfLeadingZeros(long hash, int indexBitLength) return Long.numberOfLeadingZeros(value); } - public static int numberOfTrailingZeros(long hash, int indexBitLength) - { - long value = hash | (1L << (Long.SIZE - indexBitLength)); // place a 1 in the final position of the prefix to avoid flowing into prefix when the hash happens to be 0 - return Long.numberOfTrailingZeros(value); - } - public static int computeValue(long hash, int indexBitLength) { return numberOfLeadingZeros(hash, indexBitLength) + 1; diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestBitmap.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestBitmap.java deleted file mode 100644 index a52ef0a78d..0000000000 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestBitmap.java +++ /dev/null @@ -1,185 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -import org.openjdk.jol.info.ClassLayout; -import org.testng.annotations.Test; - -import java.util.Random; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; - -public class TestBitmap -{ - private TestBitmap() {} - - @Test - public static void testRoundTrip() - { - byte[] bytes = randomBytes(100); - assertEquals(Bitmap.fromBytes(bytes).toBytes(), bytes); - } - - @Test - public static void testSetBit() - { - Bitmap bitmap = new Bitmap(24); - - // This should create the following bitmap: - // 00000011_00000101_01010101 - bitmap.setBit(0, true); - bitmap.setBit(1, true); - bitmap.setBit(8, true); - bitmap.setBit(10, true); - bitmap.setBit(16, true); - bitmap.setBit(18, true); - bitmap.setBit(20, true); - bitmap.setBit(22, true); - - byte[] bytes = bitmap.toBytes(); - assertEquals(bytes[0], 0b00000011); - assertEquals(bytes[1], 0b00000101); - assertEquals(bytes[2], 0b01010101); - - // Now clear all bits - for (int i = 0; i < 24; i++) { - bitmap.setBit(i, false); - } - - bytes = bitmap.toBytes(); - assertEquals(bytes[0], 0); - assertEquals(bytes[1], 0); - assertEquals(bytes[2], 0); - } - - @Test - public static void testGetBit() - { - Bitmap bitmap = new Bitmap(4096); - - for (int i = 0; i < 4096; i++) { - bitmap.setBit(i, true); - assertTrue(bitmap.getBit(i)); - bitmap.setBit(i, false); - assertFalse(bitmap.getBit(i)); - } - } - - @Test - public void testGetBitCount() - { - int length = 1024; - Bitmap bitmap = new Bitmap(length); - assertEquals(bitmap.getBitCount(), 0); // all zeros at initialization - for (int i = 0; i < length; i++) { - bitmap.setBit(i, true); - assertEquals(bitmap.getBitCount(), i + 1); // i + 1 "true" bits - } - } - - @Test - public static void testFlipBit() - { - Bitmap bitmap = new Bitmap(4096); - - for (int i = 0; i < 4096; i++) { - bitmap.flipBit(i); - assertTrue(bitmap.getBit(i)); - bitmap.flipBit(i); - assertFalse(bitmap.getBit(i)); - bitmap.flipBit(i); - assertTrue(bitmap.getBit(i)); - } - } - - @Test - public static void testLength() - { - for (int i = 1; i <= 10; i++) { - Bitmap bitmap = new Bitmap(i * 8); - assertEquals(bitmap.length(), i * 8); - assertEquals(bitmap.byteLength(), i); - } - } - - @Test - public static void testRandomFlips() - { - Bitmap bitmap = new Bitmap(16); - - // Note: TestingDeterministicRandomizationStrategy flips deterministically if and only if probability >= 0.5. - - TestingDeterministicRandomizationStrategy randomizationStrategy = new TestingDeterministicRandomizationStrategy(); - bitmap.flipBit(0, 0.75, randomizationStrategy); - assertTrue(bitmap.getBit(0)); - bitmap.flipBit(0, 0.75, randomizationStrategy); - assertFalse(bitmap.getBit(0)); - bitmap.flipBit(0, 0.25, randomizationStrategy); - assertFalse(bitmap.getBit(0)); - - bitmap.flipAll(0.75, randomizationStrategy); - for (int i = 0; i < 16; i++) { - assertTrue(bitmap.getBit(i)); - } - - bitmap.flipAll(0.25, randomizationStrategy); - for (int i = 0; i < 16; i++) { - assertTrue(bitmap.getBit(i)); - } - } - - @Test - public static void testClone() - { - Bitmap bitmapA = Bitmap.fromBytes(randomBytes(100)); - Bitmap bitmapB = bitmapA.clone(); - - // all bits should match - for (int i = 0; i < 100 * 8; i++) { - assertEquals(bitmapA.getBit(i), bitmapB.getBit(i)); - } - - // but the bitmaps should point to different bits - bitmapA.flipBit(0); - assertEquals(bitmapA.getBit(0), !bitmapB.getBit(0)); - } - - @Test - public static void testOr() - { - Bitmap bitmapA = Bitmap.fromBytes(randomBytes(100)); - Bitmap bitmapB = Bitmap.fromBytes(randomBytes(100)); - Bitmap bitmapC = bitmapA.or(bitmapB); - - for (int i = 0; i < 100 * 8; i++) { - assertEquals(bitmapC.getBit(i), bitmapA.getBit(i) | bitmapB.getBit(i)); - } - } - - @Test - public static void testXor() - { - Bitmap bitmapA = Bitmap.fromBytes(randomBytes(100)); - Bitmap bitmapB = Bitmap.fromBytes(randomBytes(100)); - Bitmap bitmapC = bitmapA.xor(bitmapB); - - for (int i = 0; i < 100 * 8; i++) { - assertEquals(bitmapC.getBit(i), bitmapA.getBit(i) ^ bitmapB.getBit(i)); - } - } - - @Test - public static void testRetainedSize() - { - Bitmap bitmap = Bitmap.fromBytes(randomBytes(100)); - assertEquals(bitmap.getRetainedSizeInBytes(), 100 + 16 + ClassLayout.parseClass(Bitmap.class).instanceSize()); - } - - private static byte[] randomBytes(int length) - { - byte[] bytes = new byte[length]; - Random random = new Random(); - random.nextBytes(bytes); - return bytes; - } -} diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestPrivateLpcaSketch.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestPrivateLpcaSketch.java index be7814a735..2c0574f7f6 100644 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestPrivateLpcaSketch.java +++ b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestPrivateLpcaSketch.java @@ -18,6 +18,7 @@ import static io.airlift.slice.testing.SliceAssertions.assertSlicesEqual; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; public class TestPrivateLpcaSketch @@ -29,11 +30,11 @@ public void testThresholding() for (int i = 0; i < 100_000; i++) { hll.add(i); } - PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, new TestingDeterministicRandomizationStrategy()); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0, new TestingRandomizationStrategy()); int threshold = lpca.getThreshold(); int[] rawBuckets = getBucketValues(hll); for (int i = 0; i < rawBuckets.length; i++) { - assertEquals(lpca.getBitmap().getBit(i), rawBuckets[i] > threshold); + assertEquals(getBit(lpca, i), rawBuckets[i] > threshold); } } @@ -59,7 +60,7 @@ public void testBitmapSize() for (int count : bucketCounts) { HyperLogLog hll = HyperLogLog.newInstance(count); PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0); - assertEquals(lpca.getBitmap().length(), count); + assertEquals(lpca.getBitmap().length * 8, count); } } @@ -73,14 +74,14 @@ public void testUpdate() hll2.add(-i); } - PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll1, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, new TestingDeterministicRandomizationStrategy()); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll1, 1.0, 1.0, new TestingRandomizationStrategy()); lpca.update(hll2); int threshold = lpca.getThreshold(); int[] values1 = getBucketValues(hll1); int[] values2 = getBucketValues(hll2); for (int i = 0; i < values1.length; i++) { - assertEquals(lpca.getBitmap().getBit(i), Math.max(values1[i], values2[i]) > threshold); + assertEquals(getBit(lpca, i), Math.max(values1[i], values2[i]) > threshold); } } @@ -89,7 +90,7 @@ public void testUpdateIncompatible() { HyperLogLog hll1 = HyperLogLog.newInstance(1024); HyperLogLog hll2 = HyperLogLog.newInstance(512); - PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll1, 1.0, 1.0, new TestingDeterministicRandomizationStrategy()); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll1, 1.0, 1.0, new TestingRandomizationStrategy()); boolean thrown = false; @@ -103,20 +104,56 @@ public void testUpdateIncompatible() assertTrue(thrown); } + @Test + public void testSetBit() + { + HyperLogLog hll = HyperLogLog.newInstance(32); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0, new TestingRandomizationStrategy()); + + for (int b = 0; b < lpca.getNumberOfBuckets(); b++) { + lpca.setBit(b, true); + assertTrue(getBit(lpca, b)); + lpca.setBit(b, false); + assertFalse(getBit(lpca, b)); + } + } + + @Test + public void testFlipBit() + { + HyperLogLog hll = HyperLogLog.newInstance(32); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0, new TestingRandomizationStrategy()); + + lpca.setBit(10, true); + assertTrue(getBit(lpca, 10)); + lpca.flipBit(10); + assertFalse(getBit(lpca, 10)); + lpca.flipBit(10); + assertTrue(getBit(lpca, 10)); + } + @Test public void testBitProportion() { HyperLogLog hll = HyperLogLog.newInstance(32); - PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0, new TestingDeterministicRandomizationStrategy()); + PrivateLpcaSketch lpca = new PrivateLpcaSketch(hll, 1.0, 1.0, new TestingRandomizationStrategy()); int cutoff = 18; for (int i = 0; i < lpca.getNumberOfBuckets(); i++) { - lpca.getBitmap().setBit(i, i < cutoff); + lpca.setBit(i, i < cutoff); } assertEquals(lpca.getRawBitProportion(), 18.0 / 32.0); } + private boolean getBit(PrivateLpcaSketch lpca, int bucket) + { + int b = PrivateLpcaSketch.bitmapByteIndex(bucket); + int shift = PrivateLpcaSketch.bitmapBitShift(bucket); + + return ((lpca.getBitmap()[b] >> shift) & 1) == 1; + } + private int[] getBucketValues(HyperLogLog hll) { int[] values = new int[hll.getNumberOfBuckets()]; diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestSfmSketch.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestSfmSketch.java deleted file mode 100644 index 5393cb44ed..0000000000 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestSfmSketch.java +++ /dev/null @@ -1,329 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -import io.airlift.slice.Slice; -import org.openjdk.jol.info.ClassLayout; -import org.testng.annotations.Test; - -import static io.airlift.slice.testing.SliceAssertions.assertSlicesEqual; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertNotEquals; -import static org.testng.Assert.assertTrue; - -public class TestSfmSketch -{ - @Test - public void testRoundTrip() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - for (int i = 0; i < 100_000; i++) { - sketch.add(i); - } - sketch.enablePrivacy(2); - Slice serialized = sketch.serialize(); - SfmSketch unserialized = SfmSketch.deserialize(serialized, new TestingSeededRandomizationStrategy(1)); - assertSlicesEqual(serialized, unserialized.serialize()); - } - - @Test - public void testPrivacyEnabled() - { - SfmSketch sketch = SfmSketch.create(32, 24, new TestingSeededRandomizationStrategy(1)); - assertFalse(sketch.isPrivacyEnabled()); - sketch.enablePrivacy(SfmSketch.NON_PRIVATE_EPSILON); - assertFalse(sketch.isPrivacyEnabled()); - sketch.enablePrivacy(1.23); - assertTrue(sketch.isPrivacyEnabled()); - } - - @Test - public void testSerializedSize() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - sketch.enablePrivacy(1.23); - assertEquals(sketch.estimatedSerializedSize(), sketch.serialize().length()); - } - - @Test - public void testRetainedSize() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new SecureRandomizationStrategy()); - sketch.enablePrivacy(4); - assertEquals(sketch.getRetainedSizeInBytes(), - ClassLayout.parseClass(SfmSketch.class).instanceSize() + - ClassLayout.parseClass(SecureRandomizationStrategy.class).instanceSize() + - sketch.getBitmap().getRetainedSizeInBytes()); - } - - @Test - public void testBitmapSize() - { - int[] buckets = {32, 64, 512, 1024, 4096, 32768}; - int[] precisions = {8, 16, 24, 32}; - - for (int numberOfBuckets : buckets) { - for (int precision : precisions) { - SfmSketch sketch = SfmSketch.create(numberOfBuckets, precision, new TestingSeededRandomizationStrategy(1)); - assertEquals(sketch.getBitmap().length(), numberOfBuckets * precision); - } - } - } - - @Test - public void testMergeNonPrivate() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch sketch2 = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - - // insert 100,000 non-negative integers + 100,000 negative integers - for (int i = 0; i < 100_000; i++) { - sketch.add(i); - sketch2.add(-i - 1); - } - Bitmap before = sketch.getBitmap().clone(); - sketch.mergeWith(sketch2); - - // The two bitmaps should be merged with OR, - // and the resulting bitmap is not private. - assertEquals(sketch.getBitmap().toBytes(), before.or(sketch2.getBitmap()).toBytes()); - assertFalse(sketch.isPrivacyEnabled()); - } - - @Test - public void testMergePrivate() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch sketch2 = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(2)); - - // insert 100,000 non-negative integers + 100,000 negative integers - for (int i = 0; i < 100_000; i++) { - sketch.add(i); - sketch2.add(-i - 1); - } - - Bitmap nonPrivateBitmap = sketch.getBitmap().clone(); - Bitmap nonPrivateBitmap2 = sketch2.getBitmap().clone(); - - sketch.enablePrivacy(3); - sketch2.enablePrivacy(4); - double p1 = sketch.getRandomizedResponseProbability(); - double p2 = sketch2.getRandomizedResponseProbability(); - - Bitmap unmergedBitmap = sketch.getBitmap().clone(); - sketch.mergeWith(sketch2); - - // The resulting bitmap is a randomized merge equivalent to a noisy (not deterministic) OR. - // As a result, the bitmap should not equal an OR, but it should have roughly the same number - // of 1-bits as an OR that is flipped with the merged randomizedResponseProbability. - // The resulting merged sketch is private. - assertTrue(sketch.isPrivacyEnabled()); - assertEquals(sketch.getRandomizedResponseProbability(), SfmSketch.mergeRandomizedResponseProbabilities(p1, p2)); - assertNotEquals(sketch.getBitmap().toBytes(), unmergedBitmap.or(sketch2.getBitmap()).toBytes()); - - int actualBitCount = sketch.getBitmap().getBitCount(); - Bitmap hypotheticalBitmap = nonPrivateBitmap.or(nonPrivateBitmap2); - hypotheticalBitmap.flipAll(sketch.getRandomizedResponseProbability(), new TestingSeededRandomizationStrategy(1)); - // The number of 1-bits in the merged sketch should approximately equal the number of 1-bits in our hypothetical bitmap. - assertEquals(hypotheticalBitmap.getBitCount(), actualBitCount, 100); - } - - @Test - public void testMergeMixed() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch sketch2 = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(2)); - for (int i = 0; i < 100_000; i++) { - sketch.add(i); - sketch2.add(-i - 1); - } - sketch2.enablePrivacy(3); - Bitmap before = sketch.getBitmap().clone(); - sketch.mergeWith(sketch2); - - // The resulting sketch is private. - assertTrue(sketch.isPrivacyEnabled()); - - // A mixed-privacy merge is mathematically similar to a normal private merge, but - // it turns out that some bits are deterministic. In particular, the bits of the - // merged sketch corresponding to 0s in the non-private sketch should exactly match - // the private sketch. - for (int i = 0; i < before.length(); i++) { - if (!before.getBit(i)) { - assertEquals(sketch.getBitmap().getBit(i), sketch2.getBitmap().getBit(i)); - } - } - } - - @Test - public void testMergedProbabilities() - { - // should be symmetric - assertEquals(SfmSketch.mergeRandomizedResponseProbabilities(0.1, 0.2), SfmSketch.mergeRandomizedResponseProbabilities(0.2, 0.1)); - - // private + nonprivate = private - assertEquals(SfmSketch.mergeRandomizedResponseProbabilities(0, 0.1), 0.1); - assertEquals(SfmSketch.mergeRandomizedResponseProbabilities(0.15, 0), 0.15); - - // nonprivate + nonprivate = nonprivate - assertEquals(SfmSketch.mergeRandomizedResponseProbabilities(0.0, 0.0), 0.0); - - // private + private = private (noisier) - // In particular, according to https://arxiv.org/pdf/2302.02056.pdf, Theorem 4.8, two sketches - // with epsilon1 and epsilon2 should have a merged epsilonStar of: - // -log(e^-epsilon1 + e^-epsilon2 - e^-(epsilon1 + epsilon2)) - double epsilon1 = 1.2; - double epsilon2 = 3.4; - double p1 = SfmSketch.getRandomizedResponseProbability(epsilon1); - double p2 = SfmSketch.getRandomizedResponseProbability(epsilon2); - double epsilonStar = -Math.log(Math.exp(-epsilon1) + Math.exp(-epsilon2) - Math.exp(-(epsilon1 + epsilon2))); - double pStar = SfmSketch.getRandomizedResponseProbability(epsilonStar); - assertEquals(SfmSketch.mergeRandomizedResponseProbabilities(p1, p2), pStar, 1E-6); - // note: the merged sketch is noisier (higher probability of flipped bits) - assertTrue(pStar > Math.max(p1, p2)); - } - - @Test - public void testEmptySketchCardinality() - { - SfmSketch nonPrivateSketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch privateSketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(2)); - privateSketch.enablePrivacy(3); - - // Non-private should return exactly 0 - assertEquals(nonPrivateSketch.cardinality(), 0); - - // Private will be a noisy sketch, so it should return approximately zero, but will be rather noisy. - assertEquals(privateSketch.cardinality(), 0, 200); - } - - @Test - public void testSmallCardinality() - { - int[] ns = {1, 5, 10, 50, 100, 200, 500, 1000}; - - for (int n : ns) { - SfmSketch nonPrivateSketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch privateSketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(2)); - - for (int i = 0; i < n; i++) { - nonPrivateSketch.add(i); - privateSketch.add(i); - } - - privateSketch.enablePrivacy(3); - - // Non-private should actually be quite good for small numbers - assertEquals(nonPrivateSketch.cardinality(), n, Math.max(10, 0.1 * n)); - - // Private isn't quite as good... - assertEquals(privateSketch.cardinality(), n, 200); - } - } - - @Test - public void testActualCardinalityEstimates() - { - // Since we don't have precise accuracy guarantees, this test is pretty loose. - // Note: this is slow for cardinalities beyond, say, 1 million. See `testSimulatedCardinalityEstimates` below. - int[] magnitudes = {4, 5, 6}; - double[] epsilons = {4, SfmSketch.NON_PRIVATE_EPSILON}; - for (int mag : magnitudes) { - int n = (int) Math.pow(10, mag); - for (double eps : epsilons) { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - for (int i = 0; i < n; i++) { - sketch.add(i); - } - sketch.enablePrivacy(eps); - assertEquals(sketch.cardinality(), n, n * 0.1); - } - } - } - - @Test - public void testSimulatedCardinalityEstimates() - { - // Instead of creating sketches by adding items, we simulate them for fast testing of huge cardinalities. - // For reference, 10^33 is one decillion. - // The goal here is to test general functionality and numerical stability. - int[] magnitudes = {6, 9, 12, 15, 18, 21, 24, 27, 30, 33}; - double[] epsilons = {4, SfmSketch.NON_PRIVATE_EPSILON}; - for (int mag : magnitudes) { - int n = (int) Math.pow(10, mag); - for (double eps : epsilons) { - SfmSketch sketch = createSketchWithTargetCardinality(4096, 24, eps, n); - assertEquals(sketch.cardinality(), n, n * 0.1); - } - } - } - - @Test - public void testMergedCardinalities() - { - double[] epsilons = {3, 4, SfmSketch.NON_PRIVATE_EPSILON}; - - // Test each pair of epsilons - // This gives us equal private epsilons, unequal private epsilons, mixed private and nonprivate, and totally nonprivate - for (double eps1 : epsilons) { - for (double eps2 : epsilons) { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - SfmSketch sketch2 = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(2)); - // insert 300,000 positive integers and 200,000 negative integers - for (int i = 0; i < 300_000; i++) { - sketch.add(i + 1); - if (i < 200_000) { - sketch2.add(-i); - } - } - - sketch.enablePrivacy(eps1); - sketch2.enablePrivacy(eps2); - sketch.mergeWith(sketch2); - assertEquals(sketch.cardinality(), 500_000, 500_000 * 0.1); - } - } - } - - @Test - public static void testEnablePrivacy() - { - SfmSketch sketch = SfmSketch.create(4096, 24, new TestingSeededRandomizationStrategy(1)); - double epsilon = 4; - - for (int i = 0; i < 100_000; i++) { - sketch.add(i); - } - - long cardinalityBefore = sketch.cardinality(); - sketch.enablePrivacy(epsilon); - long cardinalityAfter = sketch.cardinality(); - - // Randomized response probability should reflect the new (private) epsilon - assertEquals(sketch.getRandomizedResponseProbability(), SfmSketch.getRandomizedResponseProbability(epsilon)); - assertTrue(sketch.isPrivacyEnabled()); - - // Cardinality should remain approximately the same - assertEquals(cardinalityAfter, cardinalityBefore, cardinalityBefore * 0.1); - } - - private static SfmSketch createSketchWithTargetCardinality(int numberOfBuckets, int precision, double epsilon, int cardinality) - { - // Building a sketch by adding items is really slow (O(n)) if you want to test billions/trillions/quadrillions/etc. - // Simulating the sketch is much faster (O(buckets * precision)). - RandomizationStrategy random = new TestingSeededRandomizationStrategy(1); - SfmSketch sketch = SfmSketch.create(numberOfBuckets, precision, random); - Bitmap bitmap = sketch.getBitmap(); - double c1 = sketch.getOnProbability(); - double c2 = sketch.getOnProbability() - sketch.getRandomizedResponseProbability(); - - for (int l = 0; l < precision; l++) { - double p = c1 - c2 * Math.pow(1 - Math.pow(2, -(l + 1)) / numberOfBuckets, cardinality); - for (int b = 0; b < numberOfBuckets; b++) { - bitmap.setBit(sketch.getBitLocation(b, l), random.nextBoolean(p)); - } - } - - sketch.enablePrivacy(epsilon); - return sketch; - } -} diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestUtils.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestUtils.java index 73752f4485..c0db2dd277 100644 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestUtils.java +++ b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestUtils.java @@ -14,14 +14,9 @@ package com.facebook.airlift.stats.cardinality; import com.google.common.collect.ImmutableList; -import org.testng.annotations.Test; import java.util.List; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; - public final class TestUtils { private TestUtils() {} @@ -45,61 +40,4 @@ public static long createHashForBucket(int indexBitLength, int bucket, int leadi hash |= (long) bucket << (Long.SIZE - indexBitLength); return hash; } - - @Test - public void testPowerOf2() - { - for (int i = 1; i < 20; i++) { - assertTrue(Utils.isPowerOf2(Math.round(Math.pow(2, i)))); - assertFalse(Utils.isPowerOf2(Math.round(Math.pow(2, i)) + 1)); - } - } - - @Test - public void testNumberOfBuckets() - { - for (int i = 1; i < 20; i++) { - assertEquals(Utils.numberOfBuckets(i), Math.round(Math.pow(2, i))); - } - } - - @Test - public void testIndexBitLength() - { - for (int i = 1; i < 20; i++) { - assertEquals(Utils.indexBitLength((int) Math.pow(2, i)), i); - } - } - - @Test - public void testNumberOfLeadingZeros() - { - for (int indexBitLength : new int[]{6, 12, 18}) { - for (int i = 0; i < Long.SIZE - indexBitLength; i++) { - long hash = createHashForBucket(indexBitLength, 0, i); - assertEquals(Utils.numberOfLeadingZeros(hash, indexBitLength), i); - } - } - } - - @Test - public void testNumberOfTrailingZeros() - { - for (int indexBitLength : new int[]{6, 12, 18}) { - for (int i = 0; i < Long.SIZE - 1; i++) { - long hash = 1L << i; - assertEquals(Utils.numberOfTrailingZeros(hash, indexBitLength), Math.min(i, Long.SIZE - indexBitLength)); - } - } - } - - @Test - public void testComputeIndex() - { - for (int indexBitLength : new int[]{6, 12, 18}) { - long index = 5L; - long hash = index << (Long.SIZE - indexBitLength); - assertEquals(Utils.computeIndex(hash, indexBitLength), index); - } - } } diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingDeterministicRandomizationStrategy.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingDeterministicRandomizationStrategy.java deleted file mode 100644 index fd518ca59e..0000000000 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingDeterministicRandomizationStrategy.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -/** - * Non-random numbers for testing - */ -public class TestingDeterministicRandomizationStrategy - extends RandomizationStrategy -{ - public TestingDeterministicRandomizationStrategy() {} - - public long getRetainedSizeInBytes() - { - return 0; // This is false, but it's not particularly important here. - } - - public double nextDouble() - { - return 0.5; - } -} diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingRandomizationStrategy.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingRandomizationStrategy.java new file mode 100644 index 0000000000..36d90b92ac --- /dev/null +++ b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingRandomizationStrategy.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.airlift.stats.cardinality; + +/** + * A deterministic alternative to randomness (i.e., mock random numbers) + */ +public class TestingRandomizationStrategy + implements RandomizationStrategy +{ + public double effectiveProbability(double probability) + { + return 0.0; + } + + public boolean nextBoolean(double probability) + { + return false; + } + + public double nextLaplace(double scale) + { + return 0.0; + } +} diff --git a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingSeededRandomizationStrategy.java b/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingSeededRandomizationStrategy.java deleted file mode 100644 index b19936ec52..0000000000 --- a/stats/src/test/java/com/facebook/airlift/stats/cardinality/TestingSeededRandomizationStrategy.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.facebook.airlift.stats.cardinality; - -import java.util.Random; - -/** - * Seeded random numbers for testing - */ -public class TestingSeededRandomizationStrategy - extends RandomizationStrategy -{ - private final Random random; - - public TestingSeededRandomizationStrategy(long seed) - { - this.random = new Random(seed); - } - - public long getRetainedSizeInBytes() - { - return 0; // This is false, but it's not particularly important here. - } - - public double nextDouble() - { - return random.nextDouble(); - } -}