From b94b2778e69eafe7818d1e11cd924af07c4ffeb4 Mon Sep 17 00:00:00 2001 From: Gaston Arguindegui Date: Mon, 17 Jun 2019 15:51:00 -0700 Subject: [PATCH 1/4] Add implementation of T-Digest This commit copied the original code as it is intentionally. Refactoring will be done in future commits. --- .../presto/tdigest/AbstractTDigest.java | 151 +++ .../com/facebook/presto/tdigest/Centroid.java | 157 +++ .../presto/tdigest/MergingDigest.java | 926 ++++++++++++++++++ .../presto/tdigest/ScaleFunction.java | 617 ++++++++++++ .../com/facebook/presto/tdigest/Sort.java | 482 +++++++++ .../com/facebook/presto/tdigest/TDigest.java | 226 +++++ 6 files changed, 2559 insertions(+) create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java new file mode 100644 index 0000000000000..ce0e618bf9101 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java @@ -0,0 +1,151 @@ +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.tdigest; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; + +public abstract class AbstractTDigest extends TDigest +{ + final Random gen = new Random(); + boolean recordAllData = false; + + /** + * Same as {@link #weightedAverageSorted(double, double, double, double)} but flips + * the order of the variables if x2 is greater than + * x1. + */ + static double weightedAverage(double x1, double w1, double x2, double w2) { + if (x1 <= x2) { + return weightedAverageSorted(x1, w1, x2, w2); + } else { + return weightedAverageSorted(x2, w2, x1, w1); + } + } + + /** + * Compute the weighted average between x1 with a weight of + * w1 and x2 with a weight of w2. + * This expects x1 to be less than or equal to x2 + * and is guaranteed to return a number between x1 and + * x2. 
+ */ + private static double weightedAverageSorted(double x1, double w1, double x2, double w2) { + assert x1 <= x2; + final double x = (x1 * w1 + x2 * w2) / (w1 + w2); + return Math.max(x1, Math.min(x, x2)); + } + + static double interpolate(double x, double x0, double x1) { + return (x - x0) / (x1 - x0); + } + + static void encode(ByteBuffer buf, int n) { + int k = 0; + while (n < 0 || n > 0x7f) { + byte b = (byte) (0x80 | (0x7f & n)); + buf.put(b); + n = n >>> 7; + k++; + if (k >= 6) { + throw new IllegalStateException("Size is implausibly large"); + } + } + buf.put((byte) n); + } + + static int decode(ByteBuffer buf) { + int v = buf.get(); + int z = 0x7f & v; + int shift = 7; + while ((v & 0x80) != 0) { + if (shift > 28) { + throw new IllegalStateException("Shift too large in decode"); + } + v = buf.get(); + z += (v & 0x7f) << shift; + shift += 7; + } + return z; + } + + abstract void add(double x, int w, Centroid base); + + /** + * Computes an interpolated value of a quantile that is between two centroids. + * + * Index is the quantile desired multiplied by the total number of samples - 1. + * + * @param index Denormalized quantile desired + * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. + * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. + * @param previousMean The mean of the previous centroid. + * @param nextMean The mean of the following centroid. + * @return The interpolated mean. + */ + static double quantile(double index, double previousIndex, double nextIndex, double previousMean, double nextMean) { + final double delta = nextIndex - previousIndex; + final double previousWeight = (nextIndex - index) / delta; + final double nextWeight = (index - previousIndex) / delta; + return previousMean * previousWeight + nextMean * nextWeight; + } + + /** + * Sets up so that all centroids will record all data assigned to them. For testing only, really. + */ + @Override + public TDigest recordAllData() { + recordAllData = true; + return this; + } + + @Override + public boolean isRecording() { + return recordAllData; + } + + /** + * Adds a sample to a histogram. + * + * @param x The value to add. + */ + @Override + public void add(double x) { + add(x, 1); + } + + @Override + public void add(TDigest other) { + List tmp = new ArrayList<>(); + for (Centroid centroid : other.centroids()) { + tmp.add(centroid); + } + + Collections.shuffle(tmp, gen); + for (Centroid centroid : tmp) { + add(centroid.mean(), centroid.count(), centroid); + } + } + + protected Centroid createCentroid(double mean, int id) { + return new Centroid(mean, id, recordAllData); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java b/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java new file mode 100644 index 0000000000000..d865a9a194488 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java @@ -0,0 +1,157 @@ +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.tdigest; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A single centroid which represents a number of data points. + */ +public class Centroid implements Comparable, Serializable { + private static final AtomicInteger uniqueCount = new AtomicInteger(1); + + private double centroid = 0; + private int count = 0; + + // The ID is transient because it must be unique within a given JVM. A new + // ID should be generated from uniqueCount when a Centroid is deserialized. + private transient int id; + + private List actualData = null; + + private Centroid(boolean record) { + id = uniqueCount.getAndIncrement(); + if (record) { + actualData = new ArrayList<>(); + } + } + + public Centroid(double x) { + this(false); + start(x, 1, uniqueCount.getAndIncrement()); + } + + public Centroid(double x, int w) { + this(false); + start(x, w, uniqueCount.getAndIncrement()); + } + + public Centroid(double x, int w, int id) { + this(false); + start(x, w, id); + } + + public Centroid(double x, int id, boolean record) { + this(record); + start(x, 1, id); + } + + Centroid(double x, int w, List data) { + this(x, w); + actualData = data; + } + + private void start(double x, int w, int id) { + this.id = id; + add(x, w); + } + + public void add(double x, int w) { + if (actualData != null) { + actualData.add(x); + } + count += w; + centroid += w * (x - centroid) / count; + } + + public double mean() { + return centroid; + } + + public int count() { + return count; + } + + public int id() { + return id; + } + + @Override + public String toString() { + return "Centroid{" + + "centroid=" + centroid + + ", count=" + count + + '}'; + } + + @Override + public int hashCode() { + return id; + } + + @Override + public int compareTo(@SuppressWarnings("NullableProblems") Centroid o) { + int r = Double.compare(centroid, o.centroid); + if (r == 0) { + r = id - o.id; + } + return r; + } + + public List data() { + return actualData; + } + + @SuppressWarnings("WeakerAccess") + public void insertData(double x) { + if (actualData == null) { + actualData = new ArrayList<>(); + } + actualData.add(x); + } + + public static Centroid createWeighted(double x, int w, Iterable data) { + Centroid r = new Centroid(data != null); + r.add(x, w, data); + return r; + } + + public void add(double x, int w, Iterable data) { + if (actualData != null) { + if (data != null) { + for (Double old : data) { + actualData.add(old); + } + } else { + actualData.add(x); + } + } + centroid = AbstractTDigest.weightedAverage(centroid, count, x, w); + count += w; + } + + private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException { + in.defaultReadObject(); + id = uniqueCount.getAndIncrement(); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java new file mode 100644 index 0000000000000..e4e7b32f9d882 --- /dev/null +++ 
b/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java @@ -0,0 +1,926 @@ +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.tdigest; + +import java.nio.ByteBuffer; +import java.util.AbstractCollection; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +/** + * Maintains a t-digest by collecting new points in a buffer that is then sorted occasionally and merged + * into a sorted array that contains previously computed centroids. + *
+ * <p>
+ * This can be very fast because the cost of sorting and merging is amortized over several insertions. If
+ * we keep N centroids in total and the input buffer is k elements long, then the amortized cost per insertion is something like
+ * <p>
+ * N/k + log k
+ * <p>
+ * These costs even out when N/k = log k. Balancing costs is often a good place to start in optimizing an
+ * algorithm. For different values of the compression factor, the following table shows estimated asymptotic
+ * values of N and suggested values of k:
+ * <table>
+ * <caption>Sizing considerations for t-digest</caption>
+ * <tr><td>Compression</td><td>N</td><td>k</td></tr>
+ * <tr><td>50</td><td>78</td><td>25</td></tr>
+ * <tr><td>100</td><td>157</td><td>42</td></tr>
+ * <tr><td>200</td><td>314</td><td>73</td></tr>
+ * </table>
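+ * <p>
+ * As an illustrative check (not in the original comment): at compression 100 the table gives
+ * N/k = 157/42, which is about 3.74, and log 42 is likewise about 3.74, so the two costs balance.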
+ * <p>
+ * The virtues of this kind of t-digest implementation include:
+ * <ul>
+ * <li>No allocation is required after initialization</li>
+ * <li>The data structure automatically compresses existing centroids when possible</li>
+ * <li>No Java object overhead is incurred for centroids since data is kept in primitive arrays</li>
+ * </ul>
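+ * <p>
+ * A minimal usage sketch (illustrative only; the compression of 100 and the sample values are arbitrary):
+ * <pre>{@code
+ * TDigest digest = new MergingDigest(100);
+ * for (double x : new double[] {1.5, 2.5, 3.5, 4.5}) {
+ *     digest.add(x);
+ * }
+ * double median = digest.quantile(0.5);
+ * }</pre>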
+ * <p>
+ * The current implementation takes the liberty of using ping-pong buffers for implementing the merge resulting + * in a substantial memory penalty, but the complexity of an in place merge was not considered as worthwhile + * since even with the overhead, the memory cost is less than 40 bytes per centroid which is much less than half + * what the AVLTreeDigest uses and no dynamic allocation is required at all. + */ +public class MergingDigest extends AbstractTDigest +{ + private int mergeCount = 0; + + private final double publicCompression; + private final double compression; + + // points to the first unused centroid + private int lastUsedCell; + + // sum_i weight[i] See also unmergedWeight + private double totalWeight = 0; + + // number of points that have been added to each merged centroid + private final double[] weight; + // mean of points added to each merged centroid + private final double[] mean; + + // history of all data added to centroids (for testing purposes) + private List> data = null; + + // sum_i tempWeight[i] + private double unmergedWeight = 0; + + // this is the index of the next temporary centroid + // this is a more Java-like convention than lastUsedCell uses + private int tempUsed = 0; + private final double[] tempWeight; + private final double[] tempMean; + private List> tempData = null; + + + // array used for sorting the temp centroids. This is a field + // to avoid allocations during operation + private final int[] order; + + // if true, alternate upward and downward merge passes + public boolean useAlternatingSort = true; + // if true, use higher working value of compression during construction, then reduce on presentation + public boolean useTwoLevelCompression = true; + + // this forces centroid merging based on size limit rather than + // based on accumulated k-index. This can be much faster since we + // scale functions are more expensive than the corresponding + // weight limits. + public static boolean useWeightLimit = true; + + /** + * Allocates a buffer merging t-digest. This is the normally used constructor that + * allocates default sized internal arrays. Other versions are available, but should + * only be used for special cases. + * + * @param compression The compression factor + */ + @SuppressWarnings("WeakerAccess") + public MergingDigest(double compression) { + this(compression, -1); + } + + /** + * If you know the size of the temporary buffer for incoming points, you can use this entry point. + * + * @param compression Compression factor for t-digest. Same as 1/\delta in the paper. + * @param bufferSize How many samples to retain before merging. + */ + @SuppressWarnings("WeakerAccess") + public MergingDigest(double compression, int bufferSize) { + // we can guarantee that we only need ceiling(compression). + this(compression, bufferSize, -1); + } + + /** + * Fully specified constructor. Normally only used for deserializing a buffer t-digest. + * + * @param compression Compression factor + * @param bufferSize Number of temporary centroids + * @param size Size of main buffer + */ + @SuppressWarnings("WeakerAccess") + public MergingDigest(double compression, int bufferSize, int size) { + // ensure compression >= 10 + // default size = 2 * ceil(compression) + // default bufferSize = 5 * size + // scale = max(2, bufferSize / size - 1) + // compression, publicCompression = sqrt(scale-1)*compression, compression + // ensure size > 2 * compression + weightLimitFudge + // ensure bufferSize > 2*size + + // force reasonable value. 
Anything less than 10 doesn't make much sense because + // too few centroids are retained + if (compression < 10) { + compression = 10; + } + + // the weight limit is too conservative about sizes and can require a bit of extra room + double sizeFudge = 0; + if (useWeightLimit) { + sizeFudge = 10; + if (compression < 30) sizeFudge += 20; + } + + // default size + size = (int) Math.max(2 * compression + sizeFudge, size); + + // default buffer + if (bufferSize == -1) { + // TODO update with current numbers + // having a big buffer is good for speed + // experiments show bufferSize = 1 gives half the performance of bufferSize=10 + // bufferSize = 2 gives 40% worse performance than 10 + // but bufferSize = 5 only costs about 5-10% + // + // compression factor time(us) + // 50 1 0.275799 + // 50 2 0.151368 + // 50 5 0.108856 + // 50 10 0.102530 + // 100 1 0.215121 + // 100 2 0.142743 + // 100 5 0.112278 + // 100 10 0.107753 + // 200 1 0.210972 + // 200 2 0.148613 + // 200 5 0.118220 + // 200 10 0.112970 + // 500 1 0.219469 + // 500 2 0.158364 + // 500 5 0.127552 + // 500 10 0.121505 + bufferSize = 5 * size; + } + + // ensure enough space in buffer + if (bufferSize <= 2 * size) { + bufferSize = 2 * size; + } + + // scale is the ratio of extra buffer to the final size + // we have to account for the fact that we copy all live centroids into the incoming space + double scale = Math.max(1, bufferSize / size - 1); + if (!useTwoLevelCompression) { + scale = 1; + } + + // publicCompression is how many centroids the user asked for + // compression is how many we actually keep + this.publicCompression = compression; + this.compression = Math.sqrt(scale) * publicCompression; + + // changing the compression could cause buffers to be too small, readjust if so + if (size < this.compression + sizeFudge) { + size = (int) Math.ceil(this.compression + sizeFudge); + } + + // ensure enough space in buffer (possibly again) + if (bufferSize <= 2 * size) { + bufferSize = 2 * size; + } + + weight = new double[size]; + mean = new double[size]; + + tempWeight = new double[bufferSize]; + tempMean = new double[bufferSize]; + order = new int[bufferSize]; + + lastUsedCell = 0; + } + + /** + * Turns on internal data recording. 
+ */ + @Override + public TDigest recordAllData() { + super.recordAllData(); + data = new ArrayList<>(); + tempData = new ArrayList<>(); + return this; + } + + @Override + void add(double x, int w, Centroid base) { + add(x, w, base.data()); + } + + @Override + public void add(double x, int w) { + add(x, w, (List) null); + } + + private void add(double x, int w, List history) { + if (Double.isNaN(x)) { + throw new IllegalArgumentException("Cannot add NaN to t-digest"); + } + if (tempUsed >= tempWeight.length - lastUsedCell - 1) { + mergeNewValues(); + } + int where = tempUsed++; + tempWeight[where] = w; + tempMean[where] = x; + unmergedWeight += w; + if (x < min) { + min = x; + } + if (x > max) { + max = x; + } + + if (data != null) { + if (tempData == null) { + tempData = new ArrayList<>(); + } + while (tempData.size() <= where) { + tempData.add(new ArrayList()); + } + if (history == null) { + history = Collections.singletonList(x); + } + tempData.get(where).addAll(history); + } + } + + private void add(double[] m, double[] w, int count, List> data) { + if (m.length != w.length) { + throw new IllegalArgumentException("Arrays not same length"); + } + if (m.length < count + lastUsedCell) { + // make room to add existing centroids + double[] m1 = new double[count + lastUsedCell]; + System.arraycopy(m, 0, m1, 0, count); + m = m1; + double[] w1 = new double[count + lastUsedCell]; + System.arraycopy(w, 0, w1, 0, count); + w = w1; + } + double total = 0; + for (int i = 0; i < count; i++) { + total += w[i]; + } + merge(m, w, count, data, null, total, false, compression); + } + + @Override + public void add(List others) { + if (others.size() == 0) { + return; + } + int size = lastUsedCell; + for (TDigest other : others) { + other.compress(); + size += other.centroidCount(); + } + + double[] m = new double[size]; + double[] w = new double[size]; + List> data; + if (recordAllData) { + data = new ArrayList<>(); + } else { + data = null; + } + int offset = 0; + for (TDigest other : others) { + if (other instanceof MergingDigest) { + MergingDigest md = (MergingDigest) other; + System.arraycopy(md.mean, 0, m, offset, md.lastUsedCell); + System.arraycopy(md.weight, 0, w, offset, md.lastUsedCell); + if (data != null) { + for (Centroid centroid : other.centroids()) { + data.add(centroid.data()); + } + } + offset += md.lastUsedCell; + } else { + for (Centroid centroid : other.centroids()) { + m[offset] = centroid.mean(); + w[offset] = centroid.count(); + if (recordAllData) { + assert data != null; + data.add(centroid.data()); + } + offset++; + } + } + } + add(m, w, size, data); + } + + private void mergeNewValues() { + mergeNewValues(false, compression); + } + + private void mergeNewValues(boolean force, double compression) { + if (totalWeight == 0 && unmergedWeight == 0) { + // seriously nothing to do + return; + } + if (force || unmergedWeight > 0) { + // note that we run the merge in reverse every other merge to avoid left-to-right bias in merging + merge(tempMean, tempWeight, tempUsed, tempData, order, unmergedWeight, + useAlternatingSort & mergeCount % 2 == 1, compression); + mergeCount++; + tempUsed = 0; + unmergedWeight = 0; + if (data != null) { + tempData = new ArrayList<>(); + } + } + } + + private void merge(double[] incomingMean, double[] incomingWeight, int incomingCount, + List> incomingData, int[] incomingOrder, + double unmergedWeight, boolean runBackwards, double compression) { + System.arraycopy(mean, 0, incomingMean, incomingCount, lastUsedCell); + System.arraycopy(weight, 0, 
incomingWeight, incomingCount, lastUsedCell); + incomingCount += lastUsedCell; + + if (incomingData != null) { + for (int i = 0; i < lastUsedCell; i++) { + assert data != null; + incomingData.add(data.get(i)); + } + data = new ArrayList<>(); + } + if (incomingOrder == null) { + incomingOrder = new int[incomingCount]; + } + Sort.sort(incomingOrder, incomingMean, incomingCount); + // option to run backwards is to investigate bias in errors + if (runBackwards) { + Sort.reverse(incomingOrder, 0, incomingCount); + } + + totalWeight += unmergedWeight; + + assert (lastUsedCell + incomingCount) > 0; + lastUsedCell = 0; + mean[lastUsedCell] = incomingMean[incomingOrder[0]]; + weight[lastUsedCell] = incomingWeight[incomingOrder[0]]; + double wSoFar = 0; + if (data != null) { + assert incomingData != null; + data.add(incomingData.get(incomingOrder[0])); + } + + + // weight will contain all zeros after this loop + + double normalizer = scale.normalizer(compression, totalWeight); + double k1 = scale.k(0, normalizer); + double wLimit = totalWeight * scale.q(k1 + 1, normalizer); + for (int i = 1; i < incomingCount; i++) { + int ix = incomingOrder[i]; + double proposedWeight = weight[lastUsedCell] + incomingWeight[ix]; + double projectedW = wSoFar + proposedWeight; + boolean addThis; + if (useWeightLimit) { + double q0 = wSoFar / totalWeight; + double q2 = (wSoFar + proposedWeight) / totalWeight; + addThis = proposedWeight <= totalWeight * Math.min(scale.max(q0, normalizer), scale.max(q2, normalizer)); + } else { + addThis = projectedW <= wLimit; + } + + if (addThis) { + // next point will fit + // so merge into existing centroid + weight[lastUsedCell] += incomingWeight[ix]; + mean[lastUsedCell] = mean[lastUsedCell] + (incomingMean[ix] - mean[lastUsedCell]) * incomingWeight[ix] / weight[lastUsedCell]; + incomingWeight[ix] = 0; + + if (data != null) { + while (data.size() <= lastUsedCell) { + data.add(new ArrayList()); + } + assert incomingData != null; + assert data.get(lastUsedCell) != incomingData.get(ix); + data.get(lastUsedCell).addAll(incomingData.get(ix)); + } + } else { + // didn't fit ... move to next output, copy out first centroid + wSoFar += weight[lastUsedCell]; + if (!useWeightLimit) { + k1 = scale.k(wSoFar / totalWeight, normalizer); + wLimit = totalWeight * scale.q(k1 + 1, normalizer); + } + + lastUsedCell++; + mean[lastUsedCell] = incomingMean[ix]; + weight[lastUsedCell] = incomingWeight[ix]; + incomingWeight[ix] = 0; + + if (data != null) { + assert incomingData != null; + assert data.size() == lastUsedCell; + data.add(incomingData.get(ix)); + } + } + } + // points to next empty cell + lastUsedCell++; + + // sanity check + double sum = 0; + for (int i = 0; i < lastUsedCell; i++) { + sum += weight[i]; + } + assert sum == totalWeight; + if (runBackwards) { + Sort.reverse(mean, 0, lastUsedCell); + Sort.reverse(weight, 0, lastUsedCell); + if (data != null) { + Collections.reverse(data); + } + } + + if (totalWeight > 0) { + min = Math.min(min, mean[0]); + max = Math.max(max, mean[lastUsedCell - 1]); + } + } + + /** + * Exposed for testing. 
+ */ + int checkWeights() { + return checkWeights(weight, totalWeight, lastUsedCell); + } + + private int checkWeights(double[] w, double total, int last) { + int badCount = 0; + + int n = last; + if (w[n] > 0) { + n++; + } + + double normalizer = scale.normalizer(publicCompression, totalWeight); + double k1 = scale.k(0, normalizer); + double q = 0; + double left = 0; + String header = "\n"; + for (int i = 0; i < n; i++) { + double dq = w[i] / total; + double k2 = scale.k(q + dq, normalizer); + q += dq / 2; + if (k2 - k1 > 1 && w[i] != 1) { + System.out.printf("%sOversize centroid at " + + "%d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f, dq=%.4f, left=%.1f, current=%.2f maxw=%.2f\n", + header, i, k1, k2, k2 - k1, w[i], q, dq, left, w[i], totalWeight * scale.max(q, normalizer)); + header = ""; + badCount++; + } + if (k2 - k1 > 4 && w[i] != 1) { + throw new IllegalStateException( + String.format("Egregiously oversized centroid at " + + "%d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f, dq=%.4f, left=%.1f, current=%.2f, maxw=%.2f\n", + i, k1, k2, k2 - k1, w[i], q, dq, left, w[i], totalWeight * scale.max(q, normalizer))); + } + q += dq / 2; + left += w[i]; + k1 = k2; + } + + return badCount; + } + + /** + * Merges any pending inputs and compresses the data down to the public setting. + * Note that this typically loses a bit of precision and thus isn't a thing to + * be doing all the time. It is best done only when we want to show results to + * the outside world. + */ + @Override + public void compress() { + mergeNewValues(true, publicCompression); + } + + @Override + public long size() { + return (long) (totalWeight + unmergedWeight); + } + + @Override + public double cdf(double x) { + mergeNewValues(); + + if (lastUsedCell == 0) { + // no data to examine + return Double.NaN; + } else if (lastUsedCell == 1) { + // exactly one centroid, should have max==min + double width = max - min; + if (x < min) { + return 0; + } else if (x > max) { + return 1; + } else if (x - min <= width) { + // min and max are too close together to do any viable interpolation + return 0.5; + } else { + // interpolate if somehow we have weight > 0 and max != min + return (x - min) / (max - min); + } + } else { + int n = lastUsedCell; + if (x < min) { + return 0; + } + + if (x > max) { + return 1; + } + + // check for the left tail + if (x < mean[0]) { + // note that this is different than mean[0] > min + // ... 
this guarantees we divide by non-zero number and interpolation works + if (mean[0] - min > 0) { + // must be a sample exactly at min + if (x == min) { + return 0.5 / totalWeight; + } else { + return (1 + (x - min) / (mean[0] - min) * (weight[0] / 2 - 1)) / totalWeight; + } + } else { + // this should be redundant with the check x < min + return 0; + } + } + assert x >= mean[0]; + + // and the right tail + if (x > mean[n - 1]) { + if (max - mean[n - 1] > 0) { + if (x == max) { + return 1 - 0.5 / totalWeight; + } else { + // there has to be a single sample exactly at max + double dq = (1 + (max - x) / (max - mean[n - 1]) * (weight[n - 1] / 2 - 1)) / totalWeight; + return 1 - dq; + } + } else { + return 1; + } + } + + // we know that there are at least two centroids and mean[0] < x < mean[n-1] + // that means that there are either one or more consecutive centroids all at exactly x + // or there are consecutive centroids, c0 < x < c1 + double weightSoFar = 0; + for (int it = 0; it < n - 1; it++) { + // weightSoFar does not include weight[it] yet + if (mean[it] == x) { + // we have one or more centroids == x, treat them as one + // dw will accumulate the weight of all of the centroids at x + double dw = 0; + while (it < n && mean[it] == x) { + dw += weight[it]; + it++; + } + return (weightSoFar + dw / 2) / totalWeight; + } else if (mean[it] <= x && x < mean[it + 1]) { + // landed between centroids ... check for floating point madness + if (mean[it + 1] - mean[it] > 0) { + // note how we handle singleton centroids here + // the point is that for singleton centroids, we know that their entire + // weight is exactly at the centroid and thus shouldn't be involved in + // interpolation + double leftExcludedW = 0; + double rightExcludedW = 0; + if (weight[it] == 1) { + if (weight[it + 1] == 1) { + // two singletons means no interpolation + // left singleton is in, right is out + return (weightSoFar + 1) / totalWeight; + } else { + leftExcludedW = 0.5; + } + } else if (weight[it + 1] == 1) { + rightExcludedW = 0.5; + } + double dw = (weight[it] + weight[it + 1]) / 2; + + // can't have double singleton (handled that earlier) + assert dw > 1; + assert (leftExcludedW + rightExcludedW) <= 0.5; + + // adjust endpoints for any singleton + double left = mean[it]; + double right = mean[it + 1]; + + double dwNoSingleton = dw - leftExcludedW - rightExcludedW; + + // adjustments have only limited effect on endpoints + assert dwNoSingleton > dw / 2; + assert right - left > 0; + double base = weightSoFar + weight[it] / 2 + leftExcludedW; + return (base + dwNoSingleton * (x - left) / (right - left)) / totalWeight; + } else { + // this is simply caution against floating point madness + // it is conceivable that the centroids will be different + // but too near to allow safe interpolation + double dw = (weight[it] + weight[it + 1]) / 2; + return (weightSoFar + dw) / totalWeight; + } + } else { + weightSoFar += weight[it]; + } + } + if (x == mean[n - 1]) { + return 1 - 0.5 / totalWeight; + } else { + throw new IllegalStateException("Can't happen ... 
loop fell through"); + } + } + } + + @Override + public double quantile(double q) { + if (q < 0 || q > 1) { + throw new IllegalArgumentException("q should be in [0,1], got " + q); + } + mergeNewValues(); + + if (lastUsedCell == 0) { + // no centroids means no data, no way to get a quantile + return Double.NaN; + } else if (lastUsedCell == 1) { + // with one data point, all quantiles lead to Rome + return mean[0]; + } + + // we know that there are at least two centroids now + int n = lastUsedCell; + + // if values were stored in a sorted array, index would be the offset we are interested in + final double index = q * totalWeight; + + // beyond the boundaries, we return min or max + // usually, the first centroid will have unit weight so this will make it moot + if (index < 1) { + return min; + } + + // if the left centroid has more than one sample, we still know + // that one sample occurred at min so we can do some interpolation + if (weight[0] > 1 && index < weight[0] / 2) { + // there is a single sample at min so we interpolate with less weight + return min + (index - 1) / (weight[0] / 2 - 1) * (mean[0] - min); + } + + // usually the last centroid will have unit weight so this test will make it moot + if (index > totalWeight - 1) { + return max; + } + + // if the right-most centroid has more than one sample, we still know + // that one sample occurred at max so we can do some interpolation + if (weight[n-1] > 1 && totalWeight - index <= weight[n - 1] / 2) { + return max - (totalWeight - index - 1) / (weight[n - 1] / 2 - 1) * (max - mean[n - 1]); + } + + // in between extremes we interpolate between centroids + double weightSoFar = weight[0] / 2; + for (int i = 0; i < n - 1; i++) { + double dw = (weight[i] + weight[i + 1]) / 2; + if (weightSoFar + dw > index) { + // centroids i and i+1 bracket our current point + + // check for unit weight + double leftUnit = 0; + if (weight[i] == 1) { + if (index - weightSoFar < 0.5) { + // within the singleton's sphere + return mean[i]; + } else { + leftUnit = 0.5; + } + } + double rightUnit = 0; + if (weight[i + 1] == 1) { + if (weightSoFar + dw - index <= 0.5) { + // no interpolation needed near singleton + return mean[i + 1]; + } + rightUnit = 0.5; + } + double z1 = index - weightSoFar - leftUnit; + double z2 = weightSoFar + dw - index - rightUnit; + return weightedAverage(mean[i], z2, mean[i + 1], z1); + } + weightSoFar += dw; + } + // we handled singleton at end up above + assert weight[n - 1] > 1; + assert index <= totalWeight; + assert index >= totalWeight - weight[n - 1] / 2; + + // weightSoFar = totalWeight - weight[n-1]/2 (very nearly) + // so we interpolate out to max value ever seen + double z1 = index - totalWeight - weight[n - 1] / 2.0; + double z2 = weight[n - 1] / 2 - z1; + return weightedAverage(mean[n - 1], z1, max, z2); + } + + @Override + public int centroidCount() { + mergeNewValues(); + return lastUsedCell; + } + + @Override + public Collection centroids() { + // we don't actually keep centroid structures around so we have to fake it + compress(); + return new AbstractCollection() { + @Override + public Iterator iterator() { + return new Iterator() { + int i = 0; + + @Override + public boolean hasNext() { + return i < lastUsedCell; + } + + @Override + public Centroid next() { + Centroid rc = new Centroid(mean[i], (int) weight[i], data != null ? 
data.get(i) : null); + i++; + return rc; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Default operation"); + } + }; + } + + @Override + public int size() { + return lastUsedCell; + } + }; + } + + @Override + public double compression() { + return publicCompression; + } + + @Override + public int byteSize() { + compress(); + // format code, compression(float), buffer-size(int), temp-size(int), #centroids-1(int), + // then two doubles per centroid + return lastUsedCell * 16 + 32; + } + + @Override + public int smallByteSize() { + compress(); + // format code(int), compression(float), buffer-size(short), temp-size(short), #centroids-1(short), + // then two floats per centroid + return lastUsedCell * 8 + 30; + } + + @SuppressWarnings("WeakerAccess") + public ScaleFunction getScaleFunction() { + return scale; + } + + public enum Encoding { + VERBOSE_ENCODING(1), SMALL_ENCODING(2); + + private final int code; + + Encoding(int code) { + this.code = code; + } + } + + @Override + public void asBytes(ByteBuffer buf) { + compress(); + buf.putInt(Encoding.VERBOSE_ENCODING.code); + buf.putDouble(min); + buf.putDouble(max); + buf.putDouble(publicCompression); + buf.putInt(lastUsedCell); + for (int i = 0; i < lastUsedCell; i++) { + buf.putDouble(weight[i]); + buf.putDouble(mean[i]); + } + } + + @Override + public void asSmallBytes(ByteBuffer buf) { + compress(); + buf.putInt(Encoding.SMALL_ENCODING.code); // 4 + buf.putDouble(min); // + 8 + buf.putDouble(max); // + 8 + buf.putFloat((float) publicCompression); // + 4 + buf.putShort((short) mean.length); // + 2 + buf.putShort((short) tempMean.length); // + 2 + buf.putShort((short) lastUsedCell); // + 2 = 30 + for (int i = 0; i < lastUsedCell; i++) { + buf.putFloat((float) weight[i]); + buf.putFloat((float) mean[i]); + } + } + + @SuppressWarnings("WeakerAccess") + public static MergingDigest fromBytes(ByteBuffer buf) { + int encoding = buf.getInt(); + if (encoding == Encoding.VERBOSE_ENCODING.code) { + double min = buf.getDouble(); + double max = buf.getDouble(); + double compression = buf.getDouble(); + int n = buf.getInt(); + MergingDigest r = new MergingDigest(compression); + r.setMinMax(min, max); + r.lastUsedCell = n; + for (int i = 0; i < n; i++) { + r.weight[i] = buf.getDouble(); + r.mean[i] = buf.getDouble(); + + r.totalWeight += r.weight[i]; + } + return r; + } else if (encoding == Encoding.SMALL_ENCODING.code) { + double min = buf.getDouble(); + double max = buf.getDouble(); + double compression = buf.getFloat(); + int n = buf.getShort(); + int bufferSize = buf.getShort(); + MergingDigest r = new MergingDigest(compression, bufferSize, n); + r.setMinMax(min, max); + r.lastUsedCell = buf.getShort(); + for (int i = 0; i < r.lastUsedCell; i++) { + r.weight[i] = buf.getFloat(); + r.mean[i] = buf.getFloat(); + + r.totalWeight += r.weight[i]; + } + return r; + } else { + throw new IllegalStateException("Invalid format for serialized histogram"); + } + + } + + @Override + public String toString() { + return "MergingDigest" + + "-" + getScaleFunction() + + "-" + (useWeightLimit ? "weight" : "kSize") + + "-" + (useAlternatingSort ? "alternating" : "stable") + + "-" + (useTwoLevelCompression ? 
"twoLevel" : "oneLevel"); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java b/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java new file mode 100644 index 0000000000000..ad586586078f9 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java @@ -0,0 +1,617 @@ +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.tdigest; + +/** + * Encodes the various scale functions for t-digests. These limits trade accuracy near the tails against accuracy near + * the median in different ways. For instance, K_0 has uniform cluster sizes and results in constant accuracy (in terms + * of q) while K_3 has cluster sizes proportional to min(q,1-q) which results in very much smaller error near the tails + * and modestly increased error near the median. + *
+ * <p>
+ * The base forms (K_0, K_1, K_2 and K_3) all result in t-digests limited to a number of clusters equal to the + * compression factor. The K_2_NO_NORM and K_3_NO_NORM versions result in the cluster count increasing roughly with + * log(n). + */ +public enum ScaleFunction { + /** + * Generates uniform cluster sizes. Used for comparison only. + */ + K_0 { + @Override + public double k(double q, double compression, double n) { + return compression * q / 2; + } + + @Override + public double k(double q, double normalizer) { + return normalizer * q; + } + + @Override + public double q(double k, double compression, double n) { + return 2 * k / compression; + } + + @Override + public double q(double k, double normalizer) { + return k / normalizer; + } + + @Override + public double max(double q, double compression, double n) { + return 2 / compression; + } + + @Override + public double max(double q, double normalizer) { + return 1 / normalizer; + } + + @Override + public double normalizer(double compression, double n) { + return compression / 2; + } + }, + + /** + * Generates cluster sizes proportional to sqrt(q*(1-q)). This gives constant relative accuracy if accuracy is + * proportional to squared cluster size. It is expected that K_2 and K_3 will give better practical results. + */ + K_1 { + @Override + public double k(double q, double compression, double n) { + return compression * Math.asin(2 * q - 1) / (2 * Math.PI); + } + + @Override + public double k(double q, double normalizer) { + return normalizer * Math.asin(2 * q - 1); + } + + + @Override + public double q(double k, double compression, double n) { + return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2; + } + + @Override + public double q(double k, double normalizer) { + return (Math.sin(k / normalizer) + 1) / 2; + } + + @Override + public double max(double q, double compression, double n) { + if (q <= 0) { + return 0; + } else if (q >= 1) { + return 0; + } else { + return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q)); + } + } + + @Override + public double max(double q, double normalizer) { + if (q <= 0) { + return 0; + } else if (q >= 1) { + return 0; + } else { + return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q)); + } + } + + @Override + public double normalizer(double compression, double n) { + return compression / (2 * Math.PI); + } + }, + + /** + * Generates cluster sizes proportional to sqrt(q*(1-q)) but avoids computation of asin in the critical path by + * using an approximate version. 
+ */ + K_1_FAST { + @Override + public double k(double q, double compression, double n) { + return compression * fastAsin(2 * q - 1) / (2 * Math.PI); + } + + @Override + public double k(double q, double normalizer) { + return normalizer * fastAsin(2 * q - 1); + } + + @Override + public double q(double k, double compression, double n) { + return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2; + } + + @Override + public double q(double k, double normalizer) { + return (Math.sin(k / normalizer) + 1) / 2; + } + + @Override + public double max(double q, double compression, double n) { + if (q <= 0) { + return 0; + } else if (q >= 1) { + return 0; + } else { + return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q)); + } + } + + @Override + public double max(double q, double normalizer) { + if (q <= 0) { + return 0; + } else if (q >= 1) { + return 0; + } else { + return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q)); + } + } + + @Override + public double normalizer(double compression, double n) { + return compression / (2 * Math.PI); + } + }, + + /** + * Generates cluster sizes proportional to q*(1-q). This makes tail error bounds tighter than for K_1. The use of a + * normalizing function results in a strictly bounded number of clusters no matter how many samples. + */ + K_2 { + @Override + public double k(double q, double compression, double n) { + if (n <= 1) { + if (q <= 0) { + return -10; + } else if (q >= 1) { + return 10; + } else { + return 0; + } + } + if (q == 0) { + return 2 * k(1 / n, compression, n); + } else if (q == 1) { + return 2 * k((n - 1) / n, compression, n); + } else { + return compression * Math.log(q / (1 - q)) / Z(compression, n); + } + } + + @Override + public double k(double q, double normalizer) { + if (q < 1e-15) { + // this will return something more extreme than q = 1/n + return 2 * k(1e-15, normalizer); + } else if (q > 1 - 1e-15) { + // this will return something more extreme than q = (n-1)/n + return 2 * k(1 - 1e-15, normalizer); + } else { + return Math.log(q / (1 - q)) * normalizer; + } + } + + @Override + public double q(double k, double compression, double n) { + double w = Math.exp(k * Z(compression, n) / compression); + return w / (1 + w); + } + + @Override + public double q(double k, double normalizer) { + double w = Math.exp(k / normalizer); + return w / (1 + w); + } + + @Override + public double max(double q, double compression, double n) { + return Z(compression, n) * q * (1 - q) / compression; + } + + @Override + public double max(double q, double normalizer) { + return q * (1 - q) / normalizer; + } + + @Override + public double normalizer(double compression, double n) { + return compression / Z(compression, n); + } + + private double Z(double compression, double n) { + return 4 * Math.log(n / compression) + 24; + } + }, + + /** + * Generates cluster sizes proportional to min(q, 1-q). This makes tail error bounds tighter than for K_1 or K_2. + * The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples. 
+ */ + K_3 { + @Override + public double k(double q, double compression, double n) { + if (q < 0.9 / n) { + return 10 * k(1 / n, compression, n); + } else if (q > 1 - 0.9 / n) { + return 10 * k((n - 1) / n, compression, n); + } else { + if (q <= 0.5) { + return compression * Math.log(2 * q) / Z(compression, n); + } else { + return -k(1 - q, compression, n); + } + } + } + + @Override + public double k(double q, double normalizer) { + if (q < 1e-15) { + return 10 * k(1e-15, normalizer); + } else if (q > 1 - 1e-15) { + return 10 * k(1 - 1e-15, normalizer); + } else { + if (q <= 0.5) { + return Math.log(2 * q) / normalizer; + } else { + return -k(1 - q, normalizer); + } + } + } + + @Override + public double q(double k, double compression, double n) { + if (k <= 0) { + return Math.exp(k * Z(compression, n) / compression) / 2; + } else { + return 1 - q(-k, compression, n); + } + } + + @Override + public double q(double k, double normalizer) { + if (k <= 0) { + return Math.exp(k / normalizer) / 2; + } else { + return 1 - q(-k, normalizer); + } + } + + @Override + public double max(double q, double compression, double n) { + return Z(compression, n) * Math.min(q, 1 - q) / compression; + } + + @Override + public double max(double q, double normalizer) { + return Math.min(q, 1 - q) / normalizer; + } + + @Override + public double normalizer(double compression, double n) { + return compression / Z(compression, n); + } + + private double Z(double compression, double n) { + return 4 * Math.log(n / compression) + 21; + } + }, + + /** + * Generates cluster sizes proportional to q*(1-q). This makes the tail error bounds tighter. This version does not + * use a normalizer function and thus the number of clusters increases roughly proportional to log(n). That is good + * for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for + * tree-based implementations. + */ + K_2_NO_NORM { + @Override + public double k(double q, double compression, double n) { + if (q == 0) { + return 2 * k(1 / n, compression, n); + } else if (q == 1) { + return 2 * k((n - 1) / n, compression, n); + } else { + return compression * Math.log(q / (1 - q)); + } + } + + @Override + public double k(double q, double normalizer) { + if (q <= 1e-15) { + return 2 * k(1e-15, normalizer); + } else if (q >= 1 - 1e-15) { + return 2 * k(1 - 1e-15, normalizer); + } else { + return normalizer * Math.log(q / (1 - q)); + } + } + + @Override + public double q(double k, double compression, double n) { + double w = Math.exp(k / compression); + return w / (1 + w); + } + + @Override + public double q(double k, double normalizer) { + double w = Math.exp(k / normalizer); + return w / (1 + w); + } + + @Override + public double max(double q, double compression, double n) { + return q * (1 - q) / compression; + } + + @Override + public double max(double q, double normalizer) { + return q * (1 - q) / normalizer; + } + + @Override + public double normalizer(double compression, double n) { + return compression; + } + }, + + /** + * Generates cluster sizes proportional to min(q, 1-q). This makes the tail error bounds tighter. This version does + * not use a normalizer function and thus the number of clusters increases roughly proportional to log(n). That is + * good for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for + * tree-based implementations. 
+ */ + K_3_NO_NORM { + @Override + public double k(double q, double compression, double n) { + if (q < 0.9 / n) { + return 10 * k(1 / n, compression, n); + } else if (q > 1 - 0.9 / n) { + return 10 * k((n - 1) / n, compression, n); + } else { + if (q <= 0.5) { + return compression * Math.log(2 * q); + } else { + return -k(1 - q, compression, n); + } + } + } + + @Override + public double k(double q, double normalizer) { + if (q <= 1e-15) { + return 10 * k(1e-15, normalizer); + } else if (q > 1 - 1e-15) { + return 10 * k(1 - 1e-15, normalizer); + } else { + if (q <= 0.5) { + return normalizer * Math.log(2 * q); + } else { + return -k(1 - q, normalizer); + } + } + } + + @Override + public double q(double k, double compression, double n) { + if (k <= 0) { + return Math.exp(k / compression) / 2; + } else { + return 1 - q(-k, compression, n); + } + } + + @Override + public double q(double k, double normalizer) { + if (k <= 0) { + return Math.exp(k / normalizer) / 2; + } else { + return 1 - q(-k, normalizer); + } + } + + @Override + public double max(double q, double compression, double n) { + return Math.min(q, 1 - q) / compression; + } + + @Override + public double max(double q, double normalizer) { + return Math.min(q, 1 - q) / normalizer; + } + + @Override + public double normalizer(double compression, double n) { + return compression; + } + }; // max weight is min(q,1-q), should improve tail accuracy even more + + /** + * Converts a quantile to the k-scale. The total number of points is also provided so that a normalizing function + * can be computed if necessary. + * + * @param q The quantile + * @param compression Also known as delta in literature on the t-digest + * @param n The total number of samples + * @return The corresponding value of k + */ + abstract public double k(double q, double compression, double n); + + /** + * Converts a quantile to the k-scale. The normalizer value depends on compression and (possibly) number of points + * in the digest. #normalizer(double, double) + * + * @param q The quantile + * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the + * digest. + * @return The corresponding value of k + */ + abstract public double k(double q, double normalizer); + + /** + * Computes q as a function of k. This is often faster than finding k as a function of q for some scales. + * + * @param k The index value to convert into q scale. + * @param compression The compression factor (often written as δ) + * @param n The number of samples already in the digest. + * @return The value of q that corresponds to k + */ + abstract public double q(double k, double compression, double n); + + /** + * Computes q as a function of k. This is often faster than finding k as a function of q for some scales. + * + * @param k The index value to convert into q scale. + * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the + * digest. + * @return The value of q that corresponds to k + */ + abstract public double q(double k, double normalizer); + + /** + * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range + * spanned by a cluster that q should be isn't clear. That means that this function usually has to be taken at + * multiple points and the smallest value used. + *
+ * <p>
+ * Note that this is the relative size of a cluster. To get the max number of samples in the cluster, multiply this + * value times the total number of samples in the digest. + * + * @param q The quantile + * @param compression The compression factor, typically delta in the literature + * @param n The number of samples seen so far in the digest + * @return The maximum number of samples that can be in the cluster + */ + abstract public double max(double q, double compression, double n); + + /** + * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range + * spanned by a cluster that q should be isn't clear. That means that this function usually has to be taken at + * multiple points and the smallest value used. + *
+ * <p>
+ * Note that this is the relative size of a cluster. To get the max number of samples in the cluster, multiply this + * value times the total number of samples in the digest. + * + * @param q The quantile + * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the + * digest. + * @return The maximum number of samples that can be in the cluster + */ + abstract public double max(double q, double normalizer); + + /** + * Computes the normalizer given compression and number of points. + */ + abstract public double normalizer(double compression, double n); + + /** + * Approximates asin to within about 1e-6. This approximation works by breaking the range from 0 to 1 into 5 regions + * for all but the region nearest 1, rational polynomial models get us a very good approximation of asin and by + * interpolating as we move from region to region, we can guarantee continuity and we happen to get monotonicity as + * well. for the values near 1, we just use Math.asin as our region "approximation". + * + * @param x sin(theta) + * @return theta + */ + static double fastAsin(double x) { + if (x < 0) { + return -fastAsin(-x); + } else if (x > 1) { + return Double.NaN; + } else { + // Cutoffs for models. Note that the ranges overlap. In the + // overlap we do linear interpolation to guarantee the overall + // result is "nice" + double c0High = 0.1; + double c1High = 0.55; + double c2Low = 0.5; + double c2High = 0.8; + double c3Low = 0.75; + double c3High = 0.9; + double c4Low = 0.87; + if (x > c3High) { + return Math.asin(x); + } else { + // the models + double[] m0 = {0.2955302411, 1.2221903614, 0.1488583743, 0.2422015816, -0.3688700895, 0.0733398445}; + double[] m1 = {-0.0430991920, 0.9594035750, -0.0362312299, 0.1204623351, 0.0457029620, -0.0026025285}; + double[] m2 = {-0.034873933724, 1.054796752703, -0.194127063385, 0.283963735636, 0.023800124916, -0.000872727381}; + double[] m3 = {-0.37588391875, 2.61991859025, -2.48835406886, 1.48605387425, 0.00857627492, -0.00015802871}; + + // the parameters for all of the models + double[] vars = {1, x, x * x, x * x * x, 1 / (1 - x), 1 / (1 - x) / (1 - x)}; + + // raw grist for interpolation coefficients + double x0 = bound((c0High - x) / c0High); + double x1 = bound((c1High - x) / (c1High - c2Low)); + double x2 = bound((c2High - x) / (c2High - c3Low)); + double x3 = bound((c3High - x) / (c3High - c4Low)); + + // interpolation coefficients + //noinspection UnnecessaryLocalVariable + double mix0 = x0; + double mix1 = (1 - x0) * x1; + double mix2 = (1 - x1) * x2; + double mix3 = (1 - x2) * x3; + double mix4 = 1 - x3; + + // now mix all the results together, avoiding extra evaluations + double r = 0; + if (mix0 > 0) { + r += mix0 * eval(m0, vars); + } + if (mix1 > 0) { + r += mix1 * eval(m1, vars); + } + if (mix2 > 0) { + r += mix2 * eval(m2, vars); + } + if (mix3 > 0) { + r += mix3 * eval(m3, vars); + } + if (mix4 > 0) { + // model 4 is just the real deal + r += mix4 * Math.asin(x); + } + return r; + } + } + } + + private static double eval(double[] model, double[] vars) { + double r = 0; + for (int i = 0; i < model.length; i++) { + r += model[i] * vars[i]; + } + return r; + } + + private static double bound(double v) { + if (v <= 0) { + return 0; + } else if (v >= 1) { + return 1; + } else { + return v; + } + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java b/presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java new file mode 100644 index 0000000000000..c7f9589b492ce --- 
/dev/null +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java @@ -0,0 +1,482 @@ +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.tdigest; + +import java.util.Random; + +/** + * Static sorting methods + */ +public class Sort { + private static final Random prng = new Random(); // for choosing pivots during quicksort + /** + * Quick sort using an index array. On return, + * values[order[i]] is in order as i goes 0..values.length + * + * @param order Indexes into values + * @param values The values to sort. + */ + @SuppressWarnings("WeakerAccess") + public static void sort(int[] order, double[] values) { + sort(order, values, 0, values.length); + } + + /** + * Quick sort using an index array. On return, + * values[order[i]] is in order as i goes 0..n + * + * @param order Indexes into values + * @param values The values to sort. + * @param n The number of values to sort + */ + @SuppressWarnings("WeakerAccess") + public static void sort(int[] order, double[] values, int n) { + sort(order, values, 0, n); + } + + /** + * Quick sort using an index array. On return, + * values[order[i]] is in order as i goes start..n + * + * @param order Indexes into values + * @param values The values to sort. + * @param start The first element to sort + * @param n The number of values to sort + */ + @SuppressWarnings("WeakerAccess") + public static void sort(int[] order, double[] values, int start, int n) { + for (int i = start; i < start + n; i++) { + order[i] = i; + } + quickSort(order, values, start, start + n, 64); + insertionSort(order, values, start, start + n, 64); + } + + /** + * Standard quick sort except that sorting is done on an index array rather than the values themselves + * + * @param order The pre-allocated index array + * @param values The values to sort + * @param start The beginning of the values to sort + * @param end The value after the last value to sort + * @param limit The minimum size to recurse down to. 
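+ * (Sub-ranges no larger than this limit are left untouched here and are finished by the
+ * insertion sort pass that follows in {@link #sort(int[], double[], int, int)}.)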
+
+ /**
+ * Standard quick sort except that sorting is done on an index array rather than the values themselves
+ *
+ * @param order The pre-allocated index array
+ * @param values The values to sort
+ * @param start The beginning of the values to sort
+ * @param end The value after the last value to sort
+ * @param limit The minimum size to recurse down to; smaller ranges are left for the insertion-sort pass.
+ */
+ private static void quickSort(int[] order, double[] values, int start, int end, int limit) {
+ // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases
+ while (end - start > limit) {
+
+ // pivot by a random element
+ int pivotIndex = start + prng.nextInt(end - start);
+ double pivotValue = values[order[pivotIndex]];
+
+ // move pivot to beginning of array
+ swap(order, start, pivotIndex);
+
+ // we use a three way partition because many duplicate values is an important case
+
+ int low = start + 1; // low points to first value not known to be equal to pivotValue
+ int high = end; // high points to first value > pivotValue
+ int i = low; // i scans the array
+ while (i < high) {
+ // invariant: values[order[k]] == pivotValue for k in [start..low)
+ // invariant: values[order[k]] < pivotValue for k in [low..i)
+ // invariant: values[order[k]] > pivotValue for k in [high..end)
+ // in-loop: i < high
+ // in-loop: low < high
+ // in-loop: i >= low
+ double vi = values[order[i]];
+ if (vi == pivotValue) {
+ if (low != i) {
+ swap(order, low, i);
+ } else {
+ i++;
+ }
+ low++;
+ } else if (vi > pivotValue) {
+ high--;
+ swap(order, i, high);
+ } else {
+ // vi < pivotValue
+ i++;
+ }
+ }
+ // invariant: values[order[k]] == pivotValue for k in [start..low)
+ // invariant: values[order[k]] < pivotValue for k in [low..i)
+ // invariant: values[order[k]] > pivotValue for k in [high..end)
+ // assert i == high || low == high; therefore, we are done with the partition
+
+ // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are >
+ // we have to move the values equal to the pivot into the middle. To do this, we swap pivot
+ // values into the top end of the [low,high) range stopping when we run out of destinations
+ // or when we run out of values to copy
+ int from = start;
+ int to = high - 1;
+ for (i = 0; from < low && to >= low; i++) {
+ swap(order, from++, to--);
+ }
+ if (from == low) {
+ // ran out of things to copy. This means that the last destination is the boundary
+ low = to + 1;
+ } else {
+ // ran out of places to copy to. This means that there are uncopied pivots and the
+ // boundary is at the beginning of those
+ low = from;
+ }
+
+// checkPartition(order, values, pivotValue, start, low, high, end);
+
+ // now recurse, but arrange it so we handle the longer part by tail recursion
+ if (low - start < end - high) {
+ quickSort(order, values, start, low, limit);
+
+ // this is really a way to do
+ // quickSort(order, values, high, end, limit);
+ start = high;
+ } else {
+ quickSort(order, values, high, end, limit);
+ // this is really a way to do
+ // quickSort(order, values, start, low, limit);
+ end = low;
+ }
+ }
+ }
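+
+ // Editorial note (illustrative, not part of the original patch): a small trace of
+ // the three-way partition above. Partitioning {3, 1, 3, 2, 3} around pivot value 3
+ // first gathers the pivot copies at the front:
+ //   [start, low)  == pivot  {3, 3, 3}
+ //   [low, high)   <  pivot  {1, 2}
+ //   [high, end)   >  pivot  {}
+ // and the final swap loop then moves the pivot block into the middle, leaving
+ // {1, 2} (in some order) ahead of the pivot block {3, 3, 3}.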
+
+ /**
+ * Quick sort in place of several paired arrays. On return,
+ * keys[...] is in order and the values[] arrays will be
+ * reordered as well in the same way.
+ *
+ * @param key Values to sort on
+ * @param values The auxiliary values to sort.
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void sort(double[] key, double[] ... values) {
+ sort(key, 0, key.length, values);
+ }
+
+ /**
+ * Quick sort in place of several paired arrays. On return,
+ * key[i] is in order as i goes start..start+n, and the values[]
+ * arrays are reordered in the same way.
+ * @param key Values to sort on
+ * @param start The first element to sort
+ * @param n The number of values to sort
+ * @param values The auxiliary values to sort.
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void sort(double[] key, int start, int n, double[]... values) {
+ quickSort(key, values, start, start + n, 8);
+ insertionSort(key, values, start, start + n, 8);
+ }
+
+ /**
+ * Standard quick sort except that sorting rearranges parallel arrays
+ *
+ * @param key Values to sort on
+ * @param values The auxiliary values to sort.
+ * @param start The beginning of the values to sort
+ * @param end The value after the last value to sort
+ * @param limit The minimum size to recurse down to; smaller ranges are left for the insertion-sort pass.
+ */
+ private static void quickSort(double[] key, double[][] values, int start, int end, int limit) {
+ // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases
+ while (end - start > limit) {
+
+ // median of three values for the pivot
+ int a = start;
+ int b = (start + end) / 2;
+ int c = end - 1;
+
+ int pivotIndex;
+ double pivotValue;
+ double va = key[a];
+ double vb = key[b];
+ double vc = key[c];
+ //noinspection Duplicates
+ if (va > vb) {
+ if (vc > va) {
+ // vc > va > vb
+ pivotIndex = a;
+ pivotValue = va;
+ } else {
+ // va > vb, va >= vc
+ if (vc < vb) {
+ // va > vb > vc
+ pivotIndex = b;
+ pivotValue = vb;
+ } else {
+ // va >= vc >= vb
+ pivotIndex = c;
+ pivotValue = vc;
+ }
+ }
+ } else {
+ // vb >= va
+ if (vc > vb) {
+ // vc > vb >= va
+ pivotIndex = b;
+ pivotValue = vb;
+ } else {
+ // vb >= va, vb >= vc
+ if (vc < va) {
+ // vb >= va > vc
+ pivotIndex = a;
+ pivotValue = va;
+ } else {
+ // vb >= vc >= va
+ pivotIndex = c;
+ pivotValue = vc;
+ }
+ }
+ }
+
+ // move pivot to beginning of array
+ swap(start, pivotIndex, key, values);
+
+ // we use a three way partition because many duplicate values is an important case
+
+ int low = start + 1; // low points to first value not known to be equal to pivotValue
+ int high = end; // high points to first value > pivotValue
+ int i = low; // i scans the array
+ while (i < high) {
+ // invariant: key[k] == pivotValue for k in [start..low)
+ // invariant: key[k] < pivotValue for k in [low..i)
+ // invariant: key[k] > pivotValue for k in [high..end)
+ // in-loop: i < high
+ // in-loop: low < high
+ // in-loop: i >= low
+ double vi = key[i];
+ if (vi == pivotValue) {
+ if (low != i) {
+ swap(low, i, key, values);
+ } else {
+ i++;
+ }
+ low++;
+ } else if (vi > pivotValue) {
+ high--;
+ swap(i, high, key, values);
+ } else {
+ // vi < pivotValue
+ i++;
+ }
+ }
+ // invariant: key[k] == pivotValue for k in [start..low)
+ // invariant: key[k] < pivotValue for k in [low..i)
+ // invariant: key[k] > pivotValue for k in [high..end)
+ // assert i == high || low == high; therefore, we are done with the partition
+
+ // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are >
+ // we have to move the values equal to the pivot into the middle. To do this, we swap pivot
+ // values into the top end of the [low,high) range stopping when we run out of destinations
+ // or when we run out of values to copy
+ int from = start;
+ int to = high - 1;
+ for (i = 0; from < low && to >= low; i++) {
+ swap(from++, to--, key, values);
+ }
+ if (from == low) {
+ // ran out of things to copy. This means that the last destination is the boundary
+ low = to + 1;
+ } else {
+ // ran out of places to copy to. This means that there are uncopied pivots and the
+ // boundary is at the beginning of those
+ low = from;
+ }
+
+// checkPartition(order, values, pivotValue, start, low, high, end);
+
+ // now recurse, but arrange it so we handle the longer part by tail recursion
+ if (low - start < end - high) {
+ quickSort(key, values, start, low, limit);
+
+ // this is really a way to do
+ // quickSort(key, values, high, end, limit);
+ start = high;
+ } else {
+ quickSort(key, values, high, end, limit);
+ // this is really a way to do
+ // quickSort(key, values, start, low, limit);
+ end = low;
+ }
+ }
+ }
+
+ /**
+ * Limited range insertion sort. We assume that no element has to move more than limit steps
+ * because quick sort has done its thing. This version works on parallel arrays of keys and values.
+ *
+ * @param key The array of keys
+ * @param values The values we are sorting
+ * @param start The starting point of the sort
+ * @param end The ending point of the sort
+ * @param limit The largest amount of disorder
+ */
+ @SuppressWarnings("SameParameterValue")
+ private static void insertionSort(double[] key, double[][] values, int start, int end, int limit) {
+ // loop invariant: all values start ... i-1 are ordered
+ for (int i = start + 1; i < end; i++) {
+ double v = key[i];
+ int m = Math.max(i - limit, start);
+ for (int j = i; j >= m; j--) {
+ if (j == m || key[j - 1] <= v) {
+ if (j < i) {
+ System.arraycopy(key, j, key, j + 1, i - j);
+ key[j] = v;
+ for (double[] value : values) {
+ double tmp = value[i];
+ System.arraycopy(value, j, value, j + 1, i - j);
+ value[j] = tmp;
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
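+
+ // Editorial note (illustrative, not part of the original patch): the limit contract
+ // above comes from quickSort, which stops recursing once a range has at most limit
+ // elements; every element is therefore within limit slots of its final position, and
+ // the backward scan can safely stop at m = max(i - limit, start).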
+
+ private static void swap(int[] order, int i, int j) {
+ int t = order[i];
+ order[i] = order[j];
+ order[j] = t;
+ }
+
+ private static void swap(int i, int j, double[] key, double[]...values) {
+ double t = key[i];
+ key[i] = key[j];
+ key[j] = t;
+
+ for (int k = 0; k < values.length; k++) {
+ t = values[k][i];
+ values[k][i] = values[k][j];
+ values[k][j] = t;
+ }
+ }
+
+ /**
+ * Check that a partition step was done correctly. For debugging and testing.
+ *
+ * @param order The array of indexes representing a permutation of the keys.
+ * @param values The keys to sort.
+ * @param pivotValue The value that splits the data
+ * @param start The beginning of the data of interest.
+ * @param low Values from start (inclusive) to low (exclusive) are < pivotValue.
+ * @param high Values from low to high are equal to the pivot.
+ * @param end Values from high to end are above the pivot.
+ */
+ @SuppressWarnings("UnusedDeclaration")
+ public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) {
+ if (order.length != values.length) {
+ throw new IllegalArgumentException("Arguments must be same size");
+ }
+
+ if (!(start >= 0 && low >= start && high >= low && end >= high)) {
+ throw new IllegalArgumentException(String.format("Invalid indices %d, %d, %d, %d", start, low, high, end));
+ }
+
+ for (int i = 0; i < low; i++) {
+ double v = values[order[i]];
+ if (v >= pivotValue) {
+ throw new IllegalArgumentException(String.format("Value not less than pivot at %d", i));
+ }
+ }
+
+ for (int i = low; i < high; i++) {
+ if (values[order[i]] != pivotValue) {
+ throw new IllegalArgumentException(String.format("Non-pivot at %d", i));
+ }
+ }
+
+ for (int i = high; i < end; i++) {
+ double v = values[order[i]];
+ if (v <= pivotValue) {
+ throw new IllegalArgumentException(String.format("Value not greater than pivot at %d", i));
+ }
+ }
+ }
+
+ /**
+ * Limited range insertion sort. We assume that no element has to move more than limit steps
+ * because quick sort has done its thing.
+ *
+ * @param order The permutation index
+ * @param values The values we are sorting
+ * @param start Where to start the sort
+ * @param n One past the last element to sort
+ * @param limit The largest amount of disorder
+ */
+ @SuppressWarnings("SameParameterValue")
+ private static void insertionSort(int[] order, double[] values, int start, int n, int limit) {
+ for (int i = start + 1; i < n; i++) {
+ int t = order[i];
+ double v = values[order[i]];
+ int m = Math.max(i - limit, start);
+ for (int j = i; j >= m; j--) {
+ if (j == m || values[order[j - 1]] <= v) {
+ if (j < i) {
+ System.arraycopy(order, j, order, j + 1, i - j);
+ order[j] = t;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ /**
+ * Reverses an array in-place.
+ *
+ * @param order The array to reverse
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void reverse(int[] order) {
+ reverse(order, 0, order.length);
+ }
+
+ /**
+ * Reverses part of an array. See {@link #reverse(int[])}
+ *
+ * @param order The array containing the data to reverse.
+ * @param offset Where to start reversing.
+ * @param length How many elements to reverse
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void reverse(int[] order, int offset, int length) {
+ for (int i = 0; i < length / 2; i++) {
+ int t = order[offset + i];
+ order[offset + i] = order[offset + length - i - 1];
+ order[offset + length - i - 1] = t;
+ }
+ }
+
+ /**
+ * Reverses part of an array. See {@link #reverse(int[])}
+ *
+ * @param order The array containing the data to reverse.
+ * @param offset Where to start reversing.
+ * @param length How many elements to reverse
+ */
+ @SuppressWarnings({"WeakerAccess", "SameParameterValue"})
+ public static void reverse(double[] order, int offset, int length) {
+ for (int i = 0; i < length / 2; i++) {
+ double t = order[offset + i];
+ order[offset + i] = order[offset + length - i - 1];
+ order[offset + length - i - 1] = t;
+ }
+ }
+}
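(Editorial sketch, not part of the patch.) The paired-array overloads above are what a t-digest needs when centroid means and weights live in parallel arrays and sorting by mean must carry the weights along:

    double[] mean = {5.0, 1.0, 3.0};
    double[] weight = {50.0, 10.0, 30.0};

    // sort by mean, reordering weight the same way
    Sort.sort(mean, weight);

    // mean is now {1.0, 3.0, 5.0} and weight is {10.0, 30.0, 50.0},
    // so each weight still sits next to its mean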
diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java
new file mode 100644
index 0000000000000..467e82acda8ee
--- /dev/null
+++ b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java
@@ -0,0 +1,226 @@
+/*
+ * Licensed to Ted Dunning under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.facebook.presto.tdigest;
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Adaptive histogram based on something like streaming k-means crossed with Q-digest.
+ *
+ * The special characteristics of this algorithm are:
+ *
+ * - smaller summaries than Q-digest
+ *
+ * - works on doubles as well as integers.
+ *
+ * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy for middle quantiles
+ *
+ * - fast
+ *
+ * - simple
+ *
+ * - test coverage roughly at 90%
+ *
+ * - easy to adapt for use with map-reduce
+ */
+public abstract class TDigest implements Serializable {
+ protected ScaleFunction scale = ScaleFunction.K_2;
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+
+ /**
+ * Creates a {@link MergingDigest}. This is generally the best known implementation right now.
+ *
+ * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large.
+ * The number of centroids retained will be a smallish (usually less than 10) multiple of this number.
+ * @return the MergingDigest
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static TDigest createMergingDigest(double compression) {
+ return new MergingDigest(compression);
+ }
+
+ /**
+ * Creates a TDigest of whichever type is the currently recommended type. MergingDigest is generally the best
+ * known implementation right now.
+ *
+ * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large.
+ * The number of centroids retained will be a smallish (usually less than 10) multiple of this number.
+ * @return the TDigest
+ */
+ @SuppressWarnings({"unused", "WeakerAccess", "SameParameterValue"})
+ public static TDigest createDigest(double compression) {
+ return createMergingDigest(compression);
+ }
+
+ /**
+ * Adds a sample to a histogram.
+ *
+ * @param x The value to add.
+ * @param w The weight of this point.
+ */
+ public abstract void add(double x, int w);
+
+ final void checkValue(double x) {
+ if (Double.isNaN(x)) {
+ throw new IllegalArgumentException("Cannot add NaN");
+ }
+ }
+
+ public abstract void add(List<? extends TDigest> others);
+
+ /**
+ * Re-examines a t-digest to determine whether some centroids are redundant. If your data are
+ * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
+ *
+ * The cost is roughly the same as adding as many data points as there are centroids. This
+ * is typically < 10 * compression, but could be as high as 100 * compression.
+ *
+ * This is a destructive operation that is not thread-safe.
+ */
+ public abstract void compress();
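+
+ // Editorial note (illustrative, not part of the original patch): a digest built by
+ // merging many others is the typical candidate for compress(), e.g.
+ //
+ //   digest.add(partialDigests); // merge per-node digests
+ //   digest.compress();          // then squeeze out redundant centroids
+ //
+ // before serializing with asBytes() or asSmallBytes() below.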
+
+ /**
+ * Returns the number of points that have been added to this TDigest.
+ *
+ * @return The sum of the weights on all centroids.
+ */
+ public abstract long size();
+
+ /**
+ * Returns the fraction of all points added which are ≤ x.
+ *
+ * @param x The cutoff for the cdf.
+ * @return The fraction of all data which is less than or equal to x.
+ */
+ public abstract double cdf(double x);
+
+ /**
+ * Returns an estimate of the cutoff such that a specified fraction of the data
+ * added to this TDigest would be less than or equal to the cutoff.
+ *
+ * @param q The desired fraction
+ * @return The value x such that cdf(x) == q
+ */
+ public abstract double quantile(double q);
+
+ /**
+ * A {@link Collection} that lets you go through the centroids in ascending order by mean. Centroids
+ * returned will not be re-used, but may or may not share storage with this TDigest.
+ *
+ * @return The centroids in the form of a Collection.
+ */
+ public abstract Collection<Centroid> centroids();
+
+ /**
+ * Returns the current compression factor.
+ *
+ * @return The compression factor originally used to set up the TDigest.
+ */
+ public abstract double compression();
+
+ /**
+ * Returns the number of bytes required to encode this TDigest using #asBytes().
+ *
+ * @return The number of bytes required.
+ */
+ public abstract int byteSize();
+
+ /**
+ * Returns the number of bytes required to encode this TDigest using #asSmallBytes().
+ *
+ * Note that this is just as expensive as actually compressing the digest. If you don't
+ * care about time, but want to never over-allocate, this is fine. If you care about compression
+ * and speed, you pretty much just have to over-allocate by allocating #byteSize() bytes.
+ *
+ * @return The number of bytes required.
+ */
+ public abstract int smallByteSize();
+
+ public void setScaleFunction(ScaleFunction scaleFunction) {
+ if (scaleFunction.toString().endsWith("NO_NORM")) {
+ throw new IllegalArgumentException(
+ String.format("Can't use %s as scale with %s", scaleFunction, this.getClass()));
+ }
+ this.scale = scaleFunction;
+ }
+
+ /**
+ * Serialize this TDigest into a byte buffer. Note that the serialization used is
+ * very straightforward and is considerably larger than strictly necessary.
+ *
+ * @param buf The byte buffer into which the TDigest should be serialized.
+ */
+ public abstract void asBytes(ByteBuffer buf);
+
+ /**
+ * Serialize this TDigest into a byte buffer. Some simple compression is used
+ * such as using variable byte representation to store the centroid weights and
+ * using delta-encoding on the centroid means so that floats can be reasonably
+ * used to store the centroid means.
+ *
+ * @param buf The byte buffer into which the TDigest should be serialized.
+ */
+ public abstract void asSmallBytes(ByteBuffer buf);
+
+ /**
+ * Tell this TDigest to record the original data as much as possible for test
+ * purposes.
+ *
+ * @return This TDigest so that configurations can be done in fluent style.
+ */
+ public abstract TDigest recordAllData();
+
+ public abstract boolean isRecording();
+
+ /**
+ * Add a sample to this TDigest.
+ *
+ * @param x The data value to add
+ */
+ public abstract void add(double x);
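+
+ // Editorial note (illustrative, not part of the original patch): the two
+ // serialization paths declared above are used like
+ //
+ //   ByteBuffer buf = ByteBuffer.allocate(digest.byteSize());
+ //   digest.asBytes(buf); // simple, larger encoding
+ //
+ // or, trading CPU for space,
+ //
+ //   ByteBuffer small = ByteBuffer.allocate(digest.smallByteSize());
+ //   digest.asSmallBytes(small); // variable-byte weights, delta-coded means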
+
+ /**
+ * Add all of the centroids of another TDigest to this one.
+ *
+ * @param other The other TDigest
+ */
+ public abstract void add(TDigest other);
+
+ public abstract int centroidCount();
+
+ public double getMin() {
+ return min;
+ }
+
+ public double getMax() {
+ return max;
+ }
+
+ /**
+ * Override the min and max values for testing purposes
+ */
+ @SuppressWarnings("SameParameterValue")
+ void setMinMax(double min, double max) {
+ this.min = min;
+ this.max = max;
+ }
+}
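(Editorial sketch, not part of the patch.) A minimal end-to-end use of the class above; it relies only on methods declared in this file, and the wrapper class name is made up:

    import com.facebook.presto.tdigest.TDigest;

    public class TDigestExample
    {
        public static void main(String[] args)
        {
            // compression = 100 is the common value suggested in the javadoc
            TDigest digest = TDigest.createMergingDigest(100);
            for (int i = 1; i <= 1_000_000; i++) {
                digest.add(i);
            }
            System.out.println(digest.quantile(0.5)); // ~500000 for this uniform input
            System.out.println(digest.cdf(990_000));  // ~0.99
            System.out.println(digest.centroidCount());
        }
    }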
From 20ea0f87afa22c8408042f82ed0d485b09f353c6 Mon Sep 17 00:00:00 2001
From: Gaston Arguindegui
Date: Mon, 17 Jun 2019 16:12:35 -0700
Subject: [PATCH 2/4] Add tdigest serialization documents

---
 .../facebook/presto/tdigest/docs/t-digest.md  |   33 +
 .../presto/tdigest/docs/tdigest.graffle       | 1662 +++++++++++++++++
 .../facebook/presto/tdigest/docs/tdigest.png  |  Bin 0 -> 19278 bytes
 .../presto/tdigest/docs/tdigest_centroids.png |  Bin 0 -> 8922 bytes
 4 files changed, 1695 insertions(+)
 create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/docs/t-digest.md
 create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.graffle
 create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.png
 create mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest_centroids.png

diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/docs/t-digest.md b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/t-digest.md
new file mode 100644
index 0000000000000..af1a825c76fdf
--- /dev/null
+++ b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/t-digest.md
@@ -0,0 +1,33 @@
+# T-Digest
+
+## Background
+
+The implementation of t-digest in this library is based on [t-digest [Dunning '19]](https://arxiv.org/abs/1902.04023). It has been modified to support Slice serialization.
+
+## Format
+
+_Unless otherwise noted, all values are little-endian._
+
+### T-Digest Layout
+
+![T-Digest Layout](tdigest.png)
+
+The t-digest algorithm uses a set of centroids to store a distribution of values. A centroid is a data structure that has an associated mean and weight. In this format, the centroids that make up the t-digest are serialized by storing these two values.
+
+* format: byte identifying the serialization format (currently just one format exists, `0`)
+* type: byte identifying the data type of the values in the distribution
+* min: the minimum value of the distribution, stored as a double
+* max: the maximum value of the distribution, stored as a double
+* compression: the compression factor used when creating the t-digest
+* total weight: the total number of values in the t-digest
+* activeCentroids: the number of centroids used by the t-digest
+* centroids: the sequence of centroids which represent the t-digest, ordered by ascending mean
+
+#### Centroid Layout
+
+![T-Digest Centroid Entry](tdigest_centroids.png)
+
+The centroids section in the t-digest consists of two arrays of length `activeCentroids`:
+
+* weight[]: an array with the number of values contained in each centroid
+* mean[]: an array with the average value of each centroid
\ No newline at end of file
diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.graffle b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.graffle
new file mode 100644
index 0000000000000..7bbf5cd5d95db
--- /dev/null
+++ b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.graffle
@@ -0,0 +1,1662 @@
[The 1662-line OmniGraffle XML plist behind the two diagrams above is omitted here. Its text labels give the field widths: format (1 byte), type (1 byte), min (1 double), max (1 double), compression (1 double), total weight (1 double), activeCentroids (1 int), centroids (16 bytes * # of centroids); the centroid section is weight[] and mean[], each an array of doubles.]
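(Editorial sketch, not part of the patch.) Reading the layout back, using the field order and widths from the diagram labels noted above; that ordering is read off the diagram and may not match the code exactly. The docs mention Slice serialization, but since the Slice calls are not shown here, this sketch uses a plain little-endian ByteBuffer instead, and the class name is made up:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    public final class TDigestLayoutReader
    {
        public static void read(byte[] serialized)
        {
            ByteBuffer buf = ByteBuffer.wrap(serialized).order(ByteOrder.LITTLE_ENDIAN);

            byte format = buf.get();            // currently always 0
            byte type = buf.get();              // data type of the distribution's values
            double min = buf.getDouble();
            double max = buf.getDouble();
            double compression = buf.getDouble();
            double totalWeight = buf.getDouble();
            int activeCentroids = buf.getInt();

            // centroid section: weight[] then mean[], each activeCentroids doubles,
            // i.e. 16 bytes per centroid as the layout doc says
            double[] weight = new double[activeCentroids];
            double[] mean = new double[activeCentroids];
            for (int i = 0; i < activeCentroids; i++) {
                weight[i] = buf.getDouble();
            }
            for (int i = 0; i < activeCentroids; i++) {
                mean[i] = buf.getDouble();
            }
        }
    }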
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 total weight \ +(1 double)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Bounds + {{168.70002414908734, 94.986664558450457}, {36, 13.999994438050924}} + Class + ShapedGraphic + ID + 137 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{48.601733947793605, 169.16709747692138}, {37.29150390625, 26}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Color + + space + gg22 + w + 0 + + Size + 11 + + ID + 136 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs22 \cf0 type\ +(1 byte)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + ID + 135 + LogicalPath + + elements + + + element + MOVETO + point + {56.666645986841068, 167.52621614696335} + + + element + LINETO + point + {56.533306241398691, 110} + + + + Points + + {56.666645986841068, 167.52621614696335} + {56.533306241398691, 110} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Bounds + {{52.533306241398691, 94.986720085144043}, {9, 14}} + Class + ShapedGraphic + ID + 134 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{127.20002414908731, 218.01295966264541}, {47, 27.999988876101849}} + Class + ShapedGraphic + ID + 131 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 compression \ +(1 double)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + Head + + ID + 133 + + ID + 132 + LogicalPath + + elements + + + element + MOVETO + point + {150.70002414908734, 218.01295966264541} + + + element + LINETO + point + {150.70002414908734, 109.48671452319495} + + + + Points + + {150.70002414908734, 218.01295966264541} + {150.70002414908734, 109.48671452319495} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + Tail + + ID + 131 + + + + Bounds + {{132.70002414908734, 94.986720085144043}, {36, 13.999994438050924}} + Class + ShapedGraphic + ID + 133 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{89.682446828750074, 181.04011151904371}, {54.03515625, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 128 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 max\ +(1 double)} 
+ TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + Head + + ID + 130 + + ID + 129 + LogicalPath + + elements + + + element + MOVETO + point + {116.399122613057, 181.04011151904371} + + + element + LINETO + point + {114.86122015494765, 109.48654911092339} + + + + Points + + {116.399122613057, 181.04011151904371} + {114.86122015494765, 109.48654911092339} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + Tail + + ID + 128 + + + + Bounds + {{96.700024953750017, 94.986664558450457}, {36, 14}} + Class + ShapedGraphic + ID + 130 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{28.533306241398805, 46.066613891428744}, {37, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 127 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 Layout} + TextAlongPathGlyphAnchor + left + VerticalPad + 0.0 + + Wrap + NO + + + Bounds + {{185.0399958640337, 259.19999420642853}, {82.705078125, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 91 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 activeCentroids\ +(1 int)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + ID + 93 + LogicalPath + + elements + + + element + MOVETO + point + {222.38426813609101, 258} + + + element + LINETO + point + {219.71760146942427, 108} + + + + Points + + {222.38426813609101, 258} + {219.71760146942427, 108} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Bounds + {{204.38426813609101, 94.986844558450699}, {18, 14}} + Class + ShapedGraphic + ID + 96 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{332.6883282804597, 217.76023069828778}, {136.734375, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 84 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 centroids\ +(16 bytes * # of centroids)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Bounds + {{61.54908701280732, 218.01291853874716}, {54.03515625, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 83 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 
+{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 min \ +(1 double)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Bounds + {{26.639999404549599, 268.97396151970338}, {55, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 81 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 format = 0\ +(1 byte)} + TextAlongPathGlyphAnchor + left + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + Head + + ID + 69 + + ID + 80 + LogicalPath + + elements + + + element + MOVETO + point + {48, 266.33333333333331} + + + element + LINETO + point + {48.000020677902718, 109.48672008514401} + + + + Points + + {48, 266.33333333333331} + {48.000020677902718, 109.48672008514401} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Class + LineGraphic + ID + 79 + LogicalPath + + elements + + + element + MOVETO + point + {86.592592804473995, 219.96077055845052} + + + element + LINETO + point + {87.682446828750017, 110} + + + + Points + + {86.592592804473995, 219.96077055845052} + {87.682446828750017, 110} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Class + LineGraphic + Head + + ID + 71 + + ID + 77 + LogicalPath + + elements + + + element + MOVETO + point + {400.9835956074657, 217.76023069828778} + + + element + LINETO + point + {400.42737774400473, 109.48671348768369} + + + + Points + + {400.9835956074657, 217.76023069828778} + {400.42737774400473, 109.48671348768369} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + Tail + + ID + 84 + + + + Bounds + {{222.27702935417665, 94.986720085144043}, {356.22363951923273, 14}} + Class + ShapedGraphic + ID + 71 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{61.06666513780732, 94.986754558450528}, {36, 14}} + Class + ShapedGraphic + ID + 70 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{43.500021666666626, 94.986720085144043}, {9, 14}} + Class + ShapedGraphic + ID + 69 + Text + + TextAlongPathGlyphAnchor + center + + + + GridInfo + + HPages + 1 + KeepToScale + + Layers + + + Artboards + + Lock + + Name + Layer 1 + Print + + View + + + + LayoutInfo + + Animate + NO + circoMinDist + 18 + circoSeparation + 0.0 + layoutEngine + dot + neatoLineLength + 0.20000000298023224 + neatoSeparation + 0.0 + twopiSeparation + 0.0 + + Orientation + 1 + PrintOnePage + + RowAlign + 1 + RowSpacing + 36 + SheetTitle + Layout + UniqueID + 3 + VPages + 1 + VisibleVoidKey + 1 + + + ActiveLayerIndex + 0 + AutoAdjust + 6 + AutosizingMargin + 72 + BackgroundGraphic + + Bounds + {{0, 0}, {733, 576}} + Class + GraffleShapes.CanvasBackgroundGraphic + ID + 0 + Style + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + TextAlongPathGlyphAnchor + center + + + BaseZoom + 0 + CanvasDimensionsOrigin + {0, 0} + CanvasOrigin + {0, 0} + 
CanvasSizingMode + 1 + ColumnAlign + 1 + ColumnSpacing + 36 + DisplayScale + 1 in = 1.00000 in + GraphicsList + + + Bounds + {{257.71184377446417, 188.67875910191651}, {94.04296875, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 118 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 mean []\ +(array of doubles)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + ID + 119 + LogicalPath + + elements + + + element + MOVETO + point + {305.36292054282848, 188.34542576858314} + + + element + LINETO + point + {304.36292054282848, 91.01209243524977} + + + + Points + + {305.36292054282848, 188.34542576858314} + {304.36292054282848, 91.01209243524977} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Bounds + {{232.73332814946434, 73.319362788286639}, {144, 14}} + Class + ShapedGraphic + ID + 120 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{113.71184377446423, 190.70482208221301}, {94.04296875, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 124 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 weight []\ +(array of doubles)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + Class + LineGraphic + ID + 125 + LogicalPath + + elements + + + element + MOVETO + point + {160.23332814946434, 187.01209243524988} + + + element + LINETO + point + {160.23332814946434, 91.01209243524977} + + + + Points + + {160.23332814946434, 187.01209243524988} + {160.23332814946434, 91.01209243524977} + + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + HeadArrow + FilledArrow + Legacy + + LineType + 1 + + + + + Bounds + {{88.733328149464228, 73.319362788286639}, {144, 14}} + Class + ShapedGraphic + ID + 126 + Text + + TextAlongPathGlyphAnchor + center + + + + Bounds + {{14.577296911059193, 30.959999307990074}, {182.42578125, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 68 + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0.0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf400 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc\partightenfactor0 + +\f0\fs24 \cf0 Centroids section layout (type = 0)} + TextAlongPathGlyphAnchor + center + VerticalPad + 0.0 + + Wrap + NO + + + GridInfo + + HPages + 1 + KeepToScale + + Layers + + + Artboards + + Lock + + Name + Layer 1 + Print + + View + + + + LayoutInfo + + Animate + NO + circoMinDist + 18 + circoSeparation + 0.0 + layoutEngine + dot + neatoLineLength + 0.20000000298023224 + 
neatoSeparation + 0.0 + twopiSeparation + 0.0 + + Orientation + 1 + PrintOnePage + + RowAlign + 1 + RowSpacing + 36 + SheetTitle + Centroid Layout + UniqueID + 7 + VPages + 1 + VisibleVoidKey + 1 + + + SmartAlignmentGuidesActive + YES + SmartDistanceGuidesActive + YES + UseEntirePage + + WindowInfo + + CurrentSheet + 1 + Frame + {{2065, 871}, {1680, 878}} + ShowInfo + + ShowRuler + + Sidebar + + SidebarWidth + 244 + Sidebar_Tab + 0 + VisibleRegion + {{2.7027027027027359, -2.0270270270269481}, {766.21621621621625, 518.91891891891896}} + WindowInfo_InspectorTab + + com.omnigroup.OmniGraffle.inspectorGroup.object + + ZoomValues + + + Layout + 1.5 + 1 + + + Centroid Layout + 1.48 + 1 + + + + compressOnDiskKey + + copyLinkedImagesKey + + createSinglePDFKey + 0 + exportAreaKey + 1 + exportQualityKey + 100 + exportSizesKey + + 1 + + + fileFormatKey + 0 + htmlImageTypeKey + 0 + includeBackgroundGraphicKey + + includeNonPrintingLayersKey + + lastExportTypeKey + 0 + marginWidthKey + 0.0 + readOnlyKey + + resolutionForBMPKey + 1 + resolutionForGIFKey + 1 + resolutionForHTMLKey + 1 + resolutionForJPGKey + 1 + resolutionForPNGKey + 1 + resolutionForTIFFKey + 1 + resolutionUnitsKey + 0 + saveAsFlatFileOptionKey + 1 + useArtboardsKey + + useMarginKey + + useNotesKey + + + diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.png b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest.png new file mode 100644 index 0000000000000000000000000000000000000000..3778cdbbe1727835078af26974683cbfcee3578d GIT binary patch literal 19278 zcmd>mg;$i{_bxF=GXv5MQUcN~-J#M*cXvs54c#pgf`EbusC0LUfTVOIp`?U>_l$nN zv3@uHf$Lhr@V=+d-skN7>}OAm+7ks_EOIO)BqUrVMOh6bB$Nl>^Dhh(@bkV0uQ>RH z?4h9`jZ`&C`3?Mm>8hybfrNy87x5n%DL0Q4RKd2>()HAR{7A&Y#fihr(#710!^g=L zltw}l^$`Idovb{~XnmX7&jV26 zfrqcNrpc;-OfVE+0)7$boNi} ze+}^eUgDp&L^%=5^ny6wxLXt#Ml9kf(K|Xlly+G7^ z)@Q*NTh78_i$(C5ouv?i)wG|ux#lH`dSy+Kyal>kZx|yn3ahR;V;GvO8hd$X0UEzK zeXLP2iUuJzE7NtGpE#Ld^ZEBhgRM4i?}A_bzeYz#pZI!tcS9P6hEZs{RV8Oh=8H+g0~ zlFjY4uN!>HW`n370#l>{Wz?wN++Lr2hEfPr;-S69_~#8a$}3;nm5$K8s^;%e6wX3E zho<`cey2M@H&=QmTcasGH6Cy9T3He>YE&`mD&alU9yv)btGuOBXhrB7ab zZT9~0)Fkk1g!gm6?(4F|hu?+bW=#IJ4?|(}+36)Q$>TEXTJ&BPv!5)N_rc5KZO2iDzRlLGKm5_+F|B8F(0abaYcqNchHc`zNz|Ri z#qeP(*MU0t`Uo~fW|K#^5E=4!6-Yrp82H;i=Uq%(TF>sS9_YJkA!bdzZldx4qCzSs8-5Y+El=Z-c(e5ZSg%RxIHZ!Z;r)B zEG|Z90KHlmJk!W)ZkVGP=XPhQ^`Sb}&Sy5CW}X;tReFV91{__J6Bc_T(q2H1rt85W#^*XLoZp zV-vUo7A9$tk+DQ>wzuVc$^YwvJ=jw)1jMF?VMO?1Oy`_0FN1~dZ5jphOsWhY3qSs} zV|h09)WqTKN8K$cE_%-Lovilj&1`N9jP~ERzmj#5sKvg1bU6Rs{G-+!Lb2$({X%C$ zL*>Co*VQi5BP?u0+YyqZ_!uW+Umf1H9tQ_^!=VyCXDztJB-_4yDN|i;3*>BAL3yth z5?ZQSuvGDsGdzh8ohHC_8;FM2frx1bQf8QGX}*|$oy#({9p>W~jPpM03BE%)^P7}@ z{3d^gFKHT+O3uqbTGv32nvxq?AmH>B*07mr@V)iiI4u;XpR?^^?UUeggnOn*kZ;`T zk4nAw`XBAj(h9D7dfD|qL`S{?g-3f?z+e&k?JHl%2bCKq<2Hx%Vc%0ol>JTIT0x4K z3gYb`KE~c=s_5XGBt;fXHRXGMpEBZ?%TlxEye9aoQ$Q!8B^i&^x+*gs{Y%Z1AU!gS zbQ&ms8#;^*@fo<+Fr@$YwK6>|H7jPyzp5*Mvecd0(+C+src)z8K2lSPPXA}OL_yhO zjH+55AhYoUypx@!ovQ_-P@N=f{_J#W7p z{%nPhlkAUHA116tllWTZEC=qcrY9x28zWzB7zy@8kpv~4XgTQDL8=OeQb=xASyp%i zF^6Gx)t!pEzgs+Jiy_Gfi=BESO0ECFcD$@0=&S94df?xEqBZ*avQd;eRW!Xy@a@+> z$2$4l(4kGb=^)B*e(QNHw!fQb8vrRJuz94GFFJYQq>1}iC_n?z@QJfFEdxX$$!Qo2 zYITtCM(mHQdD78RE6@DWaQahLPexKS{iEQUpMQjYA~M8>#ps91MGj z40JycC&?)4Ib-DA3QikrVs;FHUwiM``}Uw5T3Pz`_gl(2-jDEncOH3=ReFVVj}~q9 z{m#i}3FdmB&d$9=PjPgLujmc&bchok%o=A^GKJpZ+yiGSnW&HJOqG%U&wV{~K6{kc zJm4%Y#KgB9ekAV7X**Xk&G+V&WL<0=YbSL6(ed;q-!XL*69=@x1SpOT2UL@ zy~-esL|kZH6>savEnr{FD6;TlmIBWg>13kvyuR9jD=QQpR{w^kWNReHV~{cHbZ^F` 
z?1dK=ij-9R@pZK2;bvAfyXbei)lV-Tp6?VV36Nu38*mxA6@!Cb-2M!lg!1eOJHahl-9Dzoo}sx=>rye(NXu65O~)8Y0hjWtZ0um?K(ssR#?mAo&5u zN~Ak)>zBR)5$FBF!e62ciI=^=F4EQ6Jc1SD3~}T~#1`hksr{mnqla^NZJfa}NVQTn zJ-l!Sy7GTe!qET{aOKcYF}Itzkkeycn_>t zKAUG*WtWP)-Jb+CEXa$EET9QBe|%C3ybK;;7n^Ft*ah`af z$cK8cSWkb)t~-=B+ew)YrQe?x6u&-{X<7rD`64s7BHtL_R5wbQR3uOI^j_bKp`ZcD zC)+9^Dp{Y))r%dGF$v5UdwXXXIu>GUh_YD-Q(j()+I-GGAn#JdYSk{VsiDMsxX%@y z-_GS-V>#6?<6fn{*zEmIygN+TbANXH>}g@e;2w)^GX0lm$#=%Bj`Rc3;pDUlkJVo6 zg-dRV_F&=GX5h*|INC1EpRO0$$Mpu2n51<8ar<79H~rS&X#umo_{9Wym_X80`tAvP zHtMd>Q&=xPQ>p3D?mb{!X`FkRMz63JU5z||doTII^QfTsrh{XUO?{%GfQdLHtUH=t z4WMS`v5x6WTjg8E_~ijKd;Bwo;gwD=;ycEQWFCwppYOl_te@R(*mgRh^1XHlX-^O; zenA_pE1n*=Z(@}XOc%jdl$X_`Qq5`&+Br6z{rw3P=i!uB#WeWT54{{a)29?}zo`bh zyTHSo+i!mEp5Sb9nfGA0N5zsluZTuyWya|;VljTP631`Y&iBvzxeUwZGLsfBQmat8 zr$#nhbGRNsz!u&at>`Yp<<9Ho%&kTBY*IZVgCKBsi0vItEFrtx>A}2UF1zmtFbE$r zNo5ZNUtb&zdMM_*7bgf{5zG3#|0JMt>vNp$>M_7&_Tt&J(ydQddO_gNxy>w-M83vP z0`rBTjh9=`$@~b!D=c#qOv7;nFwTAgbrd;ytAolWV~RU`Z<|}ojXRF<#N+qofNgVpUZzI9meI6mq@F+^|qC_bTCt#1b1qEk5 zOn?gWy>co}g&a9St#c5q7WQiL5js z)~x`Q8<86AHbx=7)E84)F1PE8^&}a6c;0&P793Wd43}<%aGFsC_2l_(<*B2P>&~9q$!|UK*D}5zSzm66JxojU+bPmXwH=8YX7J=bn0i|ale`klkkM1U z?^Xpn9FQ^oMWf8jPHTVhrj`LJth?JSlM1I?!IH&46v2~?(|kmZk0`X7ESe5kx;jkW zC}Q|QUvJkV(-?R*Nbx-d-ttpoP)2FVEzn2tSS{5%WuC82JbiwvVpR#zx>D>rp~s(m zS<=$3t`60hiYgjxb~|Ax#@oe3wVpW|5Ygl0nOo*o!&JPdGzhz6!)QYwVaAQNzTCQU zmD+cJO@luRZds;w^ea_Vi`&!JM@=*xUT_3XS8+lX0UYU$`IAs+%D>`P>|e{ znX<^fqy+6s?8Bn$Riw4FkJWmAu%!_ibCdgVMTE>;+Lp5$l@cN+iEh06Jr&>8N2=o<^s;-P|B%(UeC<3#3F{pxt{ADbq$nYZY?eA= z=LWb{+_33)IssMNPj0&_NB7CJTMEhwW!?6Z_u5(lReZT zOwYzTjFYNms{bIab(i#gE~c2;Hq`2DbNY?Z-!afZky%cz*wxho)7_&etft;5uolE4 z=08BH=IfjYxWM2g zy#m7}0*62r7nhc>pMD!fSoUd$_^P7mHNJt1J}edujQbJFnK0zwlko^5wc0)f#0u{P zPYg4D(|xv12js|D!THMt1-d)7-A{5LT-TwSO4O;wRkcrg_JSeQARa;^O~ z=z?!lfxtcG@MXbNR$iw-eha;n7ec-C2XA&1sg=W=7XL`(43Nm%t>VT=K7ZWv0h!yY z0wBUv<97`d;-;_E2=Dd^L9P=}t1!?e5yZ)m9iQU+ z@@J%}@YI&$rf-{I7ksH29~Yk$Ewo*2Ale;dQ+HMd1M1s4d$wM*-?+tAFT)y$g95td zj@;F0LMQ|1n;9{iwYK5BtSpVAbw93d3@}?B9Q~!-EihZD`>loqj(bc{^_=*aRHYT| z+z4P(6QE>lTmIRuG|}{u;psH^2if$Egbm(=Y54Kw(y;wU1!~p`YAeP}prciO#|RyD zVJNV>{-kgcW5Z-{09rmnDCw2Cm@h&}1yeus-S&kBtmO>qAePHV39(%NdAC*+&~{#{ z|L=NBZvT0wPcyXb&si}=?OR^ym!2_u7i#6GVWJ8|!_(?ao-t9O0|3xF0JN`R0aAJ7^x$3OT7iI8e z8KfArA%FW<&vv!`?CFn&e9@A#zkAE@zrFcaog+2zWq-Z5KZj)iOhO|O6re|>oT`iH zY6&fWz-sE}47Pyu3~@qgNCV9AS%tjF_y2jC4rq&$IF zlh12|$NyZ=#kk&i@fI|2@NvdifhN%N*Utl0p;bt8?=o;3yn$7{YzC)V3OM*&pwM z&MJE2NR~Z5i_JULFS5OBsaisg zJ4NuI^X6qeF&$&y&81dwP%=RkZj{5s~n3bUo726o*Y&SgAtD`n??L;dwfQlX>{YsQc@d67Dj0gz0?wh2P zAohJSNutLCVC*DMjRD+QBjuU}SqlDr!(r3}v3R)(u*t3fHdz-l2X6q=8U$Ex&|zd~ zQ+lb%&8s}RUJOf0CyLR=h+-I|0eEE37gY%(uGWpBuxb_p$OrKfX8?$`#JD#SF(hjTUsMxNtxMT*TmXA3~Pn@u}WI4V-$v+CF;&Q0G+ zv$ugiOGe`zjppNZaXid9cnLgP(UlO?pxcvz^>sXJM(crOI)KS?AqNFsog%!42Zm|w z`=#^+59(&aCfxer&=Mkmb9CbhmYDaT|Bz=Rev2ewXl@LHQ9Nhb6N=118f_wOZ*i7>&uNWNuanKnoN#?B^a#Jzhh6@Ditm`P>* zR+-vejRhtxdHnYPD9440UF{T0{F?hbGVAwqe@>T?D_v?Zzpl>vSgx^9@H)?bHm&FXqg7A{@^=-cX%rKBRP%(6a%}RsslHNqFN_z-pbwy)2l~7& z%u&p{vp)G+b5+6^YtsJ;zz`mz!dV~`@cU-@x8CTaJ@DH)$5Jf47oU;Lx0^bbk~<#8 z5{h(1WxkW;eSs};)N}XuAzT87MjU`wTMQLb(CszL zUJe?`kPTYY1uE_lj*BOqt`qdgsB&z&pK%U7 zQVWr$dr`cyxC@t(s5n_HDZ1qf)aKXEgjX&JNfv{D%`cN`=p`sDtr2V4Y)fg~gQh@Z zg>GwWVnV44-bCb#KCeM)bN#vCp}IS0`ncsLd4lu>{&8W&@4l%EiRM7FIpPQUx@}x{ z&8@2A!-%k%Xj=fbc)L#XTXL+Vu6$DW)~t+czw3byUNfq01dSufNWesi(&$p?Jz>NA zD1xjOPEGwLgYl4!A{M4tZiL3>|JB%a@_XuoU8Kw3(iy^0vl;QZBo^3BPj|+wM{EdpI6JzN!73kzLOhf<#mIN-&+)zei_p&c%_Vm<_#2WyqV!VLhqw_J*525lRc>XspFTMp}Zo zXemmu)m1nR9;ZA_rBIY=dTp{mb-d(KZ87(X>4J#l$am+?SR!`9Z|KxrYat)$8<4~5 
z1NF;g%rw~*E^5|#MUwp~C(N~Dq^0t$n=WjTtj)RaldH!6a}f5(?A`$Z7+fe_?d+9u zfbUKxEacF?Pg&j!${7%#TJIWV#V!NW_G6)i5O_9T!S+e5cq};=~ zZ~JG47Uc|O(BL;H5;`sA%=5Mv_=SFX6vMW|_U$Srfk*`ApQ6xMai+Odt0(M)^56qI z-_3hrZ*@B-R_PKyhrk=+6hHj@>hK%_{C^winsO0t0=f_ty>lUQ(TfbhXdW0vLiV}< zsG!eV>zQQm_cRj$_aq(`S}c8szBIo?d?hJg&BL9^*LTD11}kA!$u6`6Sp?ycX-`B3 zjZ%^JoSJ2|5FyQlx4?WN{z#COOm-X__s$6k+ir+aHPe#xi^KEA&Awp_>$nDstZX^S zrm9QgZ{KzFxI^Sd)=1cMxoVke*%LNjRqOPy8nH+nrZ%PUkXo3 zp`;LH&Rs&Q>JD6{9~WGk5I8^|IbcS#0~-S)9sAS*bAk)5;d&GXIG7_|Au5*lf`~sW z_6JZtyPK82LC1!X(}5CgNcqIf>|L;%nDgbmX36cM+J!|{wma2mfdeR|m?F{PBdp1J z(nJ`>O;7K}GqC$2j1eke^H36E5y}A3ll@*kQ%%63^Qgw>QN7-iex?Rp6cVU^jiq^3 z{xsZBuMtbqrTnXIO?Q62os^Y8C~#)93+`9ctQFO+4B4#Z9>ESjx-7RD!t|(=E)>T& z7Ai#3tHt${`(g>(?e!Ua%gIk`Zz_FQ=6}>Py{$-iA^rGGs%TodXiwWrE&H4)Ptc%> zrFw*PTA3hf`^GmTJ1diAf19o|!`-88;yN}ABzQp-TJ6+avOT#jFbzd@p7 z7;nqTc#o_=n8HFm&ln)nM}rM)z95Fz_?VV@=%7M%=}_xfO^vfZ??otq1jYw4YIBOa z29{WLG&&uxa1HEd6D)&Dm7>r?ld~KQmsXty+$BrhC>%faBL}&;jwe8f+0DOxu+x3u z(I;%4<_od0;7o-JH{q4WV|p|}i5)gn$y;8S^Nfy8hkfH->62i`4gHYkh5uaUr~axS zvfkK-u#UASLmhFACxoM&4IA+XlpJa@mkNRIGywP-u&hc~^Ygy%J0Ml`A6Gz;5z5Uj z(n|S+xY|+Ry!hg}j~}%;<>djxonG>`{lgEqP&0OCyO+jQ>IOMw_5`rL-m+I4r&KL3 zUb0#AY{MXN8=O7eeWhYQXHn^Jem(PcMp-%x`?O5JUO`cSv85W#oux45x5|@fK{flT z=l1a|TE7FD;9N3ksnjn8r2eTxG7&yD!sRI+dah#%Q6DbDfgocqlp4RDd}fuj4NhFQ zsBhfg+(u4%?yAyqb;3wla?`u3jIYOY*n~x+*G=bpYQrw8zYODSnBM#9FiN8MEx`S* zK*7R32@?)&gX-oOJgiZ#WTS}G)jde%SZ(y7Wa?I#F|{;4+@p@+aU_xMRDJrEu1HLl zLl$q@lpg6jiR|r0pBCIzl(UBl(@tMqM=2>|vb&5X!?`V2sJ)R-gGPHl#>L+e!Y4v2 zJlK{yQ-~=kqDy2xOn_iVjoLz4r_ISV#+bcRqZFxQ+wS+00{8hWA{CH6FWxnS(?d+pBYg@U*U>~eLKk-+cUcOny_X$fcQDrIDtHarR z^30kW?e_%MG(7(41vY9*d{l$xEH_C6T(R$*#xZ*M6Mv6Ri(vqF!GfCpAn4egixrra z|GHpRCmJwqyG>T3;ohixQgPtmSp?9@C9jzfcvw zgvjODb?g&Y2r251L9;5USGm%mNnAK)U9rP-;>p!?7a1W$BRxn5^rd8$Dqmc26+U7n zQ9UGfDVq#U$57olvc-PFX83JOnKI5przFp@XVF}|(U&scBZ=gH$c*CVm0B`fV9`^v z2jgdh={?x8e&R}ZVaODoETsa2Ph%hS-kt~s+14hx0gt?8rZI*onvJr6yS1*uz{(0L zM!4H@MxNG<7|)?8Psvb;!QZovC_GTRJts@|}k%MDheUvUgKhvP%oI9MUrQnMK;+Tw$1n8tgY= zx7Boxlb)S?#7vCBvg?De1V#yOY*X$Ek1X!cu!LBpcLu^zAx^HOueVBS?5(XZN{l)C z5Bpp4yO&cR(W}uEv8@Xgn3|usQynvWvX^k6?zD_Zp-zsKPVpL?rm@6ePIK!Zu(m4- z!xG=2f89s(`$B=*lbaivfzFa#6RF8^KrGtriG`9VEB^kuAqojCqi}-?39ZadT0xu? zYZuaIqdo`=30AyPp~0mrxaR-BI8VS)>z?-wlH_>Zm$23Lx>!c_n66Xm}_Uq zE~l^G^8J*Y!oG;DXnp$Q8xf7%_G!4r)!MG!!Q?wAnQq1lofW8U0Je6rFt0)#xAtLS znZhh>9;GgCjz##}vLTlFbb4A}mRFor`BS%2Z^~_6YiAOr!rzQMPMU#$ZQ&2)_Rooi z*PcIdSb7Q0`7z_Ft!S3+A=nu4%OTg{Yt6Nyk@mG+ifmD&^$shxuUk3@yj+_Q-nR?ba5iuF;1&C@B9MDx- z4*%B2$7(;9>>lfT&YHoe7-+0g%^(`Dvn>32%d?j1-Isi~U)3M?+Jn)M^2BnIAmgN5 zWc*5>C?g5gI);_$t^a<|T4++^BH%9{aPZRDZv0fH9tt>DS(4=2ZkX%42^drmdc6T`SdN@Wn6(L# zO}-ceBHv2VYHy;#-Z~&kW3|0@j8IQ};kWym(&zK*6TwB#H2z)CSGS#5dqq{#kU79V zm=jxezkXyx&ocD*#4iiLq4rJAZ5OxGL4fhNf%uuL+BCowuE!N<+`LHImVm9*J($Lv z79}Hp_(}x{AIqiJCk27YEjA9SiXK670B+|8f{opK9D}FBOeh&tB&+$yfwyA-74ZU* z{S^Viisv9+oQj=qXeAW=_?vs2%Bb|B`|yL{qc{^+k%AY(J(%(elx{r$j_Jcm@z7Gb zo8We^iBtq!-dA#Q#Dr4cGFL$yJOB_JxQUZd;dJl z_92%U3Ou`m>=3trl+zCw9Xnry&Qe6iPA*~5p%I7q9n2kc$v9b_?&8F4hKcu9D-5em zkw`U>{ZGzfdxh__<3=(e)&=u6>YpgQCUHsUJ&n@D1fp8 z>z`f<5}}~sS+Z5Iw)yTv+-hdtKCtxxq?)ToaJIowX9Ia@BCYop zgDEk{KR|lqOz83#pw4a1^SZy3W$k_n@pV1LGoJ%{F~E@WIc7m9I{LlIrL(X?;#F~> z&+X;5@LIG?4~@?Xay0xTwon&OPVD-yJBRBS14qHb`g8uL{*mpcTzwzwSI*3CUTl# z?#(hQ1ZIDC%gS!QIdIWI7k{Ia#?&cu*pnyh5xw!=^48WH-*3#;jdV|4f$AfO;C9LM zoOGTDl3N@+_$l_T<#@=8bT9qBW^yR$ zZHgb`#pq_@G;Ow?>l`(Rc~zbBDQd1aH&j@5S%Pa`@^pUv*X2z6t<|@18r6Up+wKE= z7IY;JT}m2W>5RaUYq|BAIwu1b2y$E`jn?WR4S@ziuZH1#R9u}v_JVr3O-oM!(U`ytPXmrRtlNY@@IEpm&>?nh? 
z2mHP{FQ!-3VAP3HyyIe~fuDDdKgf#Thc*T^Yx986wi&c}WF{sjU$l^Yxz9mvvDp{= zIkWUOm=QWf>L$F}v_fSOggRAb`VJPe~Icr?vjxcR|C@Olc-V2p?l9@2YS0*&? zI73{Z_b}ST20GeW#RIQs+p_a>+XGCMSUZ~3`xB{h&TGtjBJhYB{bS!!4`3<0Kz=9V=8EJXaB7B9 zdfYpZ5UDDVswuJ@Hq~i|-C((9mGbCK^@qOWPXq=V#{LJr+cG%IS+8qzUYgx@8)=~c zwZU>INT#gw&7{`ED93x1rb)<(`?0~A4J{A5wBi(c-B*h{8715N5&OiDueACi48&`#_wZ#c#K79*En z?-#7n&>5iomuzZih$>@O$yp@urAh!D8rM2cKKdv=S?(f#Mu3u#2#B%66kW)wOCa!@ zoRlAtqU#waof^jK&#E*kK4D7r6$7vCwM?hIDR=Z-eo_>hs!SoKk1e3GSrvUw%&R4? z8M;Cl-Zk6A!|H8%Rq!Zkn_qoxJgR2Pz zWC3ktwDKBpw|4q;&DL=@!0BbO0(AC=y&f(}d#`g2<_(Q?V}GCO*1)T9V3l~bJ9!b% z5GRJ`@G{#W>yaLh)G=|!=Js>z+WwZx&{vk)8MM)ot-jGSNhYwTsFj&V#p@i{cSf>Q zI}On86`Ro3**ibA+ez6FtK2Ar4do2GFV0v@?6xR1EmXhmR*!AQxx@AH@$n0Enj?}- zQ7sS0ZxTMup)1@)J?=Oj%JE!A_d(Q$QanItdgNlULlnYh@6Bcwq6DRs?0=HI#3&;q zbh(i>=P#MTu!o}^9w27j%N1gb|#)8+GZP*6)ph3jr$LBV7!3Xcy)xOLq=gocdI7WTb#HX!Pu)HQjsxK9?>`omxHHC26L z_p-)_zKE-I8&6s!3N!5xQJkpYJ?RyNw>|_E`bx&;wOBS0&z%suGeD`d=l)5;ruGx5 zrk+sqGwjO>51xGqkB?!A&G8h3Ms~+ckUHsM@O-QPx%Cg$;JMHhjbov}=dGR%?e4tq zOV3Ucyt%DM)RT=~5S3#gUn%pq*8*%6{R`zi4SEy2rHP%M0O&YYVOU{1ea~1~P9v>c z%AMa@4FnxNxkeZ)=h2<5e63%o+d+KrTW!RpfUCnNg*b*zxI+9v(tArlwSQ1LalNBW z9Ik-6o%6~3W$<|R(b3^s_qZl?4^KRwp4id|hdD0UPjk>=COVI_UO{5_qy)-ttr=>Y z;yo#JD%tGz4_S+H8}XP?ROj4kfVH> zc>e9(DQchLLNgs?mZXS-=)(7J;hWrv8$)adi;tZ^$b7+!xupn61h#=n3O z)OLg%%E70r8tDL(;mm!cI{8Q?)+IpwQKu`)XPYSNkVtgu)$SFKmk)U;yTvG@*C-q0 zcxCB}z7UVB%ODn&tkZIkXZEkw1Ri&?^IiH4koitN=ZpVC5$>r~0f?=%mo-qMYrBs4gm4@_dh5nMhD-(n#pZ5qFBqwLM z(xW<@rq(s3;bt+5#I0d%fsbuSO*r05>J&Z@)WV01aiQM72ThZf$F%7W^&#ioY8YZu zLq>VFNmmlfP?3^{(k~c}753BAGvom-vdG=#n2)gjiOMjM_8DR_t;MJ|NtKRe^VUX7 zXdDfp;VtClqR1`p3jXskm1SuPy8 zMkukxC9OVvg8~k?Ft#EWTt#g};+@J0in>YnWcA=p0IFQ7-3NzAkm6C!1|6SCBX#0E zvXScbiEIy`{=mF*nyW-CD$DaoKc`>M;geH&WU)zMFUj5~Mt|MwLerdi6Z=Sl?k?*v z_5BmXR()ejzy$_eTYeMFCB~TXJjcdM1!=CGLkNJPAQ6o_(rBCN=l|)bULXKfg z4?ZO@qrwZ{&^0z^Bz24EH{ywnKB_-R#gG20x{7wRcPq=fK1vr>i4y*{Yh-fEgVIEi zU2qZ+?R=!kSNxL@^f%l2X1phvM{@D7soRHe^ChP{~K&NKOuoli(**L|!(bl%I z4X2bbV99dZ03lFMe&%>;q)&HZ8wxF4v;q|M`##2#Y`uJ5Mr)TwVubgY2-pJ#8^Xu2 zN(LkOM0K}gUoJFY;xLm(!LUkHECXjZ=bQUwd0k+i3Y*cA6V{ zp;uBF@WTkdJYS+(!V~F|Jkzb-uANhsl(wOI;>UN_T^?pPAm0R7kfJn(c~$3Q&ESNx zlH?B6W`$F85#eU)jywrfCp9V1eZh@iZ18;QF^B=_2FfX$>h!WF1!Qx*Op$yHZ=-Oa zu_^0_kGIJwDwbr?6!efhFrv9nqL3vskn3rTe?MOY-E=!osQV490`pfpH~Vn z@ng4y9x8V-*3AvOZDFAzXOtN3r?ZQti66_Nh0dV~gv#cAB6y>c`QkGEe!Aiv6azN- z?I$NSrxtu>A=QsM($Ji+1rJQ(iS(Y>eigB-^n5gW0W&9;E3?`pPkj7 zcuS;{G~nA;MjDjVyG_X6S$IhTvyPfZmKn%6j4ZuZUI97*%AN00$tbYy1Rbfm&3t6Z z<|#F_4$2FOzx3e}<83MAr{(OT!l8Op>iQT~UeQWv6pAS)-A&O(laA`yX8Q25YKO9O zDP|3i--s*Wj~FZXdvsCjxU0E7W0`%69dt)v^%+zj2^AvyO0C3*M_#fc5$`g)rWzwq z0*$6pW&^kaULDBjlSPHsgi?YSIV0L%yjtE{xD2Ke@aXVgZk)^^LR1+R;`7FEof&OT z_-yAb=}%&Ylr3F+sfv}n+}V2rWEhaj=?2QN2SU%uNR&d-H(>EKTx2|6Cm!|HyGriP zXj>xLhmB?CCKfJE-l!iqhKI0E{q{F6oJ_3 z!WG{F^Uksfxrt}XiGMKX8#V!yO=+CtY1%b!9;S!+9S%)v>xP`ivmg8leudx-;oWiW z5+7a z11*aHq)4rRoEl*kLST>#%4m0%!6PLTLH^X{#pcY1Zvb_}D&x6}A`Fn}^x~|8kFQhV zEnO8Ok&;oMjMc^F^hdi?_%2pG)?E}bM46(jxst~zqLU?u6|Qv2<{+YYKgbtX``Mrj z?wN=d8C^0x;*p;ou8}r)bmONbOgGj6=!oPOnF zj3k@a2txteovZ4FoAGv3ALVnS?c8!9kkG>_4_j^m1=7)q_}V^_ z&s>6>7#aab(i~?%t+K)&lVQ2(MTc1DQ94GjL1(T6SoR@=DP*f^v3%b0ekR^D*t-lqxNyhndaPVk3=&ORp zf4>G$VL)u7)8|7r~^!_%n2}VpOFM&v?ia?_A6L?B_F6C*Z1cY>K#w7R#P}Artxq5gT zG^aD38qdAfWcGAZn*_vViIh2r2?Vxc5_Zi=_+>3UOXGVY;gjJ6q zE1g{=@v509OeAK1jDkf2TW7_x$Itqa<_D{Fdf1s zCO}GX9H}Qbl#`L-gg&^s-f0J9F{;(>S?n)SE9=(YlIA+`?A(|4GU@u0ainW|#-NmB z0m)qY{BcE5&z+lIN3OmHnN+i{WRvRWte?+W0brRuBoF;_XR$49N{6928ptnHL_?W6 zR%0d4nO7DE?j?yyz!5-B)hp@Dsv}Sd#|W~b20=!L<2WM(zl-1vZ$uKnd@MnbXpO@i 
[base85-encoded binary image data omitted]
literal 0
HcmV?d00001

diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest_centroids.png b/presto-main/src/main/java/com/facebook/presto/tdigest/docs/tdigest_centroids.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a9877db1e0bac4431bac1c6fbacf9c4115f140d
GIT binary patch
literal 8922
[base85-encoded binary image data omitted]
literal 0
HcmV?d00001

From 1a389b16f1428d68d4f1182a98fe5833ac49a11e Mon Sep 17 00:00:00 2001
From: Gaston Arguindegui
Date: Thu, 20 Jun 2019 10:25:09 -0700
Subject: [PATCH 3/4] Add NOTICES file

---
 NOTICES | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 NOTICES

diff --git a/NOTICES b/NOTICES
new file mode 100644
index 0000000000000..9c15b9d6155f4
--- /dev/null
+++ b/NOTICES
@@ -0,0 +1,3 @@
+The code for the t-digest was originally authored by Ted Dunning
+
+Adrien Grand contributed the heart of the AVLTreeDigest (https://github.com/jpountz)

From 55d352fd318b718dbe2cd992a25f434d13b4fa92 Mon Sep 17 00:00:00 2001
From: Gaston Arguindegui
Date: Mon, 17 Jun 2019 16:07:59 -0700
Subject: [PATCH 4/4] Modify t-digest for Presto

Remove unnecessary functions for Presto t-digest needs.
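For context, here is a minimal sketch of pre-patch usage through the
MergingDigest entry points that this commit folds into TDigest. This is
not code in the patch; the class name, seed, compression factor, and
query points are illustrative only.

    import com.facebook.presto.tdigest.MergingDigest;
    import com.facebook.presto.tdigest.TDigest;

    import java.util.Random;

    public class TDigestUsageSketch
    {
        public static void main(String[] args)
        {
            // compression 100 retains roughly 157 centroids, per the
            // sizing table in the MergingDigest class javadoc
            TDigest digest = new MergingDigest(100);

            // seeded for reproducibility
            Random random = new Random(42);
            for (int i = 0; i < 1_000_000; i++) {
                digest.add(random.nextDouble());
            }

            // merge any buffered points down to the public compression setting
            digest.compress();

            // for uniform [0, 1) input these should be close to 0.5 and 0.99
            System.out.println(digest.quantile(0.5));
            System.out.println(digest.quantile(0.99));

            // fraction of samples at or below 0.25; should be close to 0.25
            System.out.println(digest.cdf(0.25));
        }
    }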
--- .../presto/tdigest/AbstractTDigest.java | 151 --- .../com/facebook/presto/tdigest/Centroid.java | 142 ++- .../presto/tdigest/MergingDigest.java | 926 ------------------ .../presto/tdigest/ScaleFunction.java | 617 ------------ .../com/facebook/presto/tdigest/TDigest.java | 755 ++++++++++---- .../tdigest/{Sort.java => TDigestUtils.java} | 181 ++-- .../presto/tdigest/BenchmarkTDigest.java | 203 ++++ .../facebook/presto/tdigest/TestTDigest.java | 369 +++++++ 8 files changed, 1350 insertions(+), 1994 deletions(-) delete mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java delete mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java delete mode 100644 presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java rename presto-main/src/main/java/com/facebook/presto/tdigest/{Sort.java => TDigestUtils.java} (82%) create mode 100644 presto-main/src/test/java/com/facebook/presto/tdigest/BenchmarkTDigest.java create mode 100644 presto-main/src/test/java/com/facebook/presto/tdigest/TestTDigest.java diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java deleted file mode 100644 index ce0e618bf9101..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/tdigest/AbstractTDigest.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to Ted Dunning under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.tdigest; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Random; - -public abstract class AbstractTDigest extends TDigest -{ - final Random gen = new Random(); - boolean recordAllData = false; - - /** - * Same as {@link #weightedAverageSorted(double, double, double, double)} but flips - * the order of the variables if x2 is greater than - * x1. - */ - static double weightedAverage(double x1, double w1, double x2, double w2) { - if (x1 <= x2) { - return weightedAverageSorted(x1, w1, x2, w2); - } else { - return weightedAverageSorted(x2, w2, x1, w1); - } - } - - /** - * Compute the weighted average between x1 with a weight of - * w1 and x2 with a weight of w2. - * This expects x1 to be less than or equal to x2 - * and is guaranteed to return a number between x1 and - * x2. 
- */ - private static double weightedAverageSorted(double x1, double w1, double x2, double w2) { - assert x1 <= x2; - final double x = (x1 * w1 + x2 * w2) / (w1 + w2); - return Math.max(x1, Math.min(x, x2)); - } - - static double interpolate(double x, double x0, double x1) { - return (x - x0) / (x1 - x0); - } - - static void encode(ByteBuffer buf, int n) { - int k = 0; - while (n < 0 || n > 0x7f) { - byte b = (byte) (0x80 | (0x7f & n)); - buf.put(b); - n = n >>> 7; - k++; - if (k >= 6) { - throw new IllegalStateException("Size is implausibly large"); - } - } - buf.put((byte) n); - } - - static int decode(ByteBuffer buf) { - int v = buf.get(); - int z = 0x7f & v; - int shift = 7; - while ((v & 0x80) != 0) { - if (shift > 28) { - throw new IllegalStateException("Shift too large in decode"); - } - v = buf.get(); - z += (v & 0x7f) << shift; - shift += 7; - } - return z; - } - - abstract void add(double x, int w, Centroid base); - - /** - * Computes an interpolated value of a quantile that is between two centroids. - * - * Index is the quantile desired multiplied by the total number of samples - 1. - * - * @param index Denormalized quantile desired - * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. - * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. - * @param previousMean The mean of the previous centroid. - * @param nextMean The mean of the following centroid. - * @return The interpolated mean. - */ - static double quantile(double index, double previousIndex, double nextIndex, double previousMean, double nextMean) { - final double delta = nextIndex - previousIndex; - final double previousWeight = (nextIndex - index) / delta; - final double nextWeight = (index - previousIndex) / delta; - return previousMean * previousWeight + nextMean * nextWeight; - } - - /** - * Sets up so that all centroids will record all data assigned to them. For testing only, really. - */ - @Override - public TDigest recordAllData() { - recordAllData = true; - return this; - } - - @Override - public boolean isRecording() { - return recordAllData; - } - - /** - * Adds a sample to a histogram. - * - * @param x The value to add. - */ - @Override - public void add(double x) { - add(x, 1); - } - - @Override - public void add(TDigest other) { - List tmp = new ArrayList<>(); - for (Centroid centroid : other.centroids()) { - tmp.add(centroid); - } - - Collections.shuffle(tmp, gen); - for (Centroid centroid : tmp) { - add(centroid.mean(), centroid.count(), centroid); - } - } - - protected Centroid createCentroid(double mean, int id) { - return new Centroid(mean, id, recordAllData); - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java b/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java index d865a9a194488..9f92277feb36a 100644 --- a/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/Centroid.java @@ -1,3 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Licensed to Ted Dunning under one or more * contributor license agreements. See the NOTICE file distributed with @@ -17,141 +31,107 @@ package com.facebook.presto.tdigest; -import java.io.IOException; -import java.io.ObjectInputStream; +import javax.annotation.concurrent.NotThreadSafe; + import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import static java.util.Objects.requireNonNull; + /** * A single centroid which represents a number of data points. */ -public class Centroid implements Comparable, Serializable { +@NotThreadSafe +public class Centroid + implements Comparable, Serializable +{ private static final AtomicInteger uniqueCount = new AtomicInteger(1); - private double centroid = 0; - private int count = 0; + private double centroid; + private int count; // The ID is transient because it must be unique within a given JVM. A new // ID should be generated from uniqueCount when a Centroid is deserialized. private transient int id; - private List actualData = null; - - private Centroid(boolean record) { - id = uniqueCount.getAndIncrement(); - if (record) { - actualData = new ArrayList<>(); - } - } - - public Centroid(double x) { - this(false); + public Centroid(double x) + { start(x, 1, uniqueCount.getAndIncrement()); } - public Centroid(double x, int w) { - this(false); + public Centroid(double x, int w) + { start(x, w, uniqueCount.getAndIncrement()); } - public Centroid(double x, int w, int id) { - this(false); + public Centroid(double x, int w, int id) + { start(x, w, id); } - public Centroid(double x, int id, boolean record) { - this(record); - start(x, 1, id); - } - - Centroid(double x, int w, List data) { - this(x, w); - actualData = data; - } - - private void start(double x, int w, int id) { + private void start(double x, int w, int id) + { this.id = id; add(x, w); } - public void add(double x, int w) { - if (actualData != null) { - actualData.add(x); - } + public void add(double x, int w) + { count += w; centroid += w * (x - centroid) / count; } - public double mean() { + public double getMean() + { return centroid; } - public int count() { + public int getWeight() + { return count; } - public int id() { + public int getId() + { return id; } @Override - public String toString() { + public String toString() + { return "Centroid{" + - "centroid=" + centroid + + "mean=" + centroid + ", count=" + count + '}'; } @Override - public int hashCode() { + public int hashCode() + { return id; } @Override - public int compareTo(@SuppressWarnings("NullableProblems") Centroid o) { - int r = Double.compare(centroid, o.centroid); - if (r == 0) { - r = id - o.id; + public boolean equals(Object o) + { + if (this == o) { + return true; } - return r; - } - - public List data() { - return actualData; - } - - @SuppressWarnings("WeakerAccess") - public void insertData(double x) { - if (actualData == null) { - actualData = new ArrayList<>(); + if (o == null || getClass() != o.getClass()) { + return false; } - actualData.add(x); - } - - public static Centroid createWeighted(double x, int w, Iterable data) { - Centroid r = new Centroid(data != null); - r.add(x, w, data); - return r; + Centroid centroid = (Centroid) o; + return this.centroid == centroid.getMean() && this.count == centroid.getWeight(); } - public void add(double x, int w, Iterable data) { - if (actualData != null) { - if (data != null) { - 
for (Double old : data) { - actualData.add(old); - } - } else { - actualData.add(x); - } + @Override + public int compareTo(Centroid o) + { + requireNonNull(o); + int r = Double.compare(centroid, o.centroid); + if (r == 0) { + r = id - o.id; } - centroid = AbstractTDigest.weightedAverage(centroid, count, x, w); - count += w; - } - - private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException { - in.defaultReadObject(); - id = uniqueCount.getAndIncrement(); + return r; } } diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java deleted file mode 100644 index e4e7b32f9d882..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/tdigest/MergingDigest.java +++ /dev/null @@ -1,926 +0,0 @@ -/* - * Licensed to Ted Dunning under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.tdigest; - -import java.nio.ByteBuffer; -import java.util.AbstractCollection; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -/** - * Maintains a t-digest by collecting new points in a buffer that is then sorted occasionally and merged - * into a sorted array that contains previously computed centroids. - *

- * This can be very fast because the cost of sorting and merging is amortized over several insertions. If
- * we keep N centroids total and the input array is k long, then the amortized cost is something like
- * <p>
- * N/k + log k
- * <p>
- * These costs even out when N/k = log k. Balancing costs is often a good place to start in optimizing an
- * algorithm. For different values of compression factor, the following table shows estimated asymptotic
- * values of N and suggested values of k:
- *
- *   Compression | N   | k
- *   ----------- | --- | ---
- *   50          | 78  | 25
- *   100         | 157 | 42
- *   200         | 314 | 73
- *
- * (Sizing considerations for t-digest)
- * <p>
- * The virtues of this kind of t-digest implementation include:
- * <ul>
- * <li>No allocation is required after initialization</li>
- * <li>The data structure automatically compresses existing centroids when possible</li>
- * <li>No Java object overhead is incurred for centroids since data is kept in primitive arrays</li>
- * </ul>
- * <p>
- * The current implementation takes the liberty of using ping-pong buffers for implementing the merge resulting - * in a substantial memory penalty, but the complexity of an in place merge was not considered as worthwhile - * since even with the overhead, the memory cost is less than 40 bytes per centroid which is much less than half - * what the AVLTreeDigest uses and no dynamic allocation is required at all. - */ -public class MergingDigest extends AbstractTDigest -{ - private int mergeCount = 0; - - private final double publicCompression; - private final double compression; - - // points to the first unused centroid - private int lastUsedCell; - - // sum_i weight[i] See also unmergedWeight - private double totalWeight = 0; - - // number of points that have been added to each merged centroid - private final double[] weight; - // mean of points added to each merged centroid - private final double[] mean; - - // history of all data added to centroids (for testing purposes) - private List> data = null; - - // sum_i tempWeight[i] - private double unmergedWeight = 0; - - // this is the index of the next temporary centroid - // this is a more Java-like convention than lastUsedCell uses - private int tempUsed = 0; - private final double[] tempWeight; - private final double[] tempMean; - private List> tempData = null; - - - // array used for sorting the temp centroids. This is a field - // to avoid allocations during operation - private final int[] order; - - // if true, alternate upward and downward merge passes - public boolean useAlternatingSort = true; - // if true, use higher working value of compression during construction, then reduce on presentation - public boolean useTwoLevelCompression = true; - - // this forces centroid merging based on size limit rather than - // based on accumulated k-index. This can be much faster since we - // scale functions are more expensive than the corresponding - // weight limits. - public static boolean useWeightLimit = true; - - /** - * Allocates a buffer merging t-digest. This is the normally used constructor that - * allocates default sized internal arrays. Other versions are available, but should - * only be used for special cases. - * - * @param compression The compression factor - */ - @SuppressWarnings("WeakerAccess") - public MergingDigest(double compression) { - this(compression, -1); - } - - /** - * If you know the size of the temporary buffer for incoming points, you can use this entry point. - * - * @param compression Compression factor for t-digest. Same as 1/\delta in the paper. - * @param bufferSize How many samples to retain before merging. - */ - @SuppressWarnings("WeakerAccess") - public MergingDigest(double compression, int bufferSize) { - // we can guarantee that we only need ceiling(compression). - this(compression, bufferSize, -1); - } - - /** - * Fully specified constructor. Normally only used for deserializing a buffer t-digest. - * - * @param compression Compression factor - * @param bufferSize Number of temporary centroids - * @param size Size of main buffer - */ - @SuppressWarnings("WeakerAccess") - public MergingDigest(double compression, int bufferSize, int size) { - // ensure compression >= 10 - // default size = 2 * ceil(compression) - // default bufferSize = 5 * size - // scale = max(2, bufferSize / size - 1) - // compression, publicCompression = sqrt(scale-1)*compression, compression - // ensure size > 2 * compression + weightLimitFudge - // ensure bufferSize > 2*size - - // force reasonable value. 
Anything less than 10 doesn't make much sense because - // too few centroids are retained - if (compression < 10) { - compression = 10; - } - - // the weight limit is too conservative about sizes and can require a bit of extra room - double sizeFudge = 0; - if (useWeightLimit) { - sizeFudge = 10; - if (compression < 30) sizeFudge += 20; - } - - // default size - size = (int) Math.max(2 * compression + sizeFudge, size); - - // default buffer - if (bufferSize == -1) { - // TODO update with current numbers - // having a big buffer is good for speed - // experiments show bufferSize = 1 gives half the performance of bufferSize=10 - // bufferSize = 2 gives 40% worse performance than 10 - // but bufferSize = 5 only costs about 5-10% - // - // compression factor time(us) - // 50 1 0.275799 - // 50 2 0.151368 - // 50 5 0.108856 - // 50 10 0.102530 - // 100 1 0.215121 - // 100 2 0.142743 - // 100 5 0.112278 - // 100 10 0.107753 - // 200 1 0.210972 - // 200 2 0.148613 - // 200 5 0.118220 - // 200 10 0.112970 - // 500 1 0.219469 - // 500 2 0.158364 - // 500 5 0.127552 - // 500 10 0.121505 - bufferSize = 5 * size; - } - - // ensure enough space in buffer - if (bufferSize <= 2 * size) { - bufferSize = 2 * size; - } - - // scale is the ratio of extra buffer to the final size - // we have to account for the fact that we copy all live centroids into the incoming space - double scale = Math.max(1, bufferSize / size - 1); - if (!useTwoLevelCompression) { - scale = 1; - } - - // publicCompression is how many centroids the user asked for - // compression is how many we actually keep - this.publicCompression = compression; - this.compression = Math.sqrt(scale) * publicCompression; - - // changing the compression could cause buffers to be too small, readjust if so - if (size < this.compression + sizeFudge) { - size = (int) Math.ceil(this.compression + sizeFudge); - } - - // ensure enough space in buffer (possibly again) - if (bufferSize <= 2 * size) { - bufferSize = 2 * size; - } - - weight = new double[size]; - mean = new double[size]; - - tempWeight = new double[bufferSize]; - tempMean = new double[bufferSize]; - order = new int[bufferSize]; - - lastUsedCell = 0; - } - - /** - * Turns on internal data recording. 
- */ - @Override - public TDigest recordAllData() { - super.recordAllData(); - data = new ArrayList<>(); - tempData = new ArrayList<>(); - return this; - } - - @Override - void add(double x, int w, Centroid base) { - add(x, w, base.data()); - } - - @Override - public void add(double x, int w) { - add(x, w, (List) null); - } - - private void add(double x, int w, List history) { - if (Double.isNaN(x)) { - throw new IllegalArgumentException("Cannot add NaN to t-digest"); - } - if (tempUsed >= tempWeight.length - lastUsedCell - 1) { - mergeNewValues(); - } - int where = tempUsed++; - tempWeight[where] = w; - tempMean[where] = x; - unmergedWeight += w; - if (x < min) { - min = x; - } - if (x > max) { - max = x; - } - - if (data != null) { - if (tempData == null) { - tempData = new ArrayList<>(); - } - while (tempData.size() <= where) { - tempData.add(new ArrayList()); - } - if (history == null) { - history = Collections.singletonList(x); - } - tempData.get(where).addAll(history); - } - } - - private void add(double[] m, double[] w, int count, List> data) { - if (m.length != w.length) { - throw new IllegalArgumentException("Arrays not same length"); - } - if (m.length < count + lastUsedCell) { - // make room to add existing centroids - double[] m1 = new double[count + lastUsedCell]; - System.arraycopy(m, 0, m1, 0, count); - m = m1; - double[] w1 = new double[count + lastUsedCell]; - System.arraycopy(w, 0, w1, 0, count); - w = w1; - } - double total = 0; - for (int i = 0; i < count; i++) { - total += w[i]; - } - merge(m, w, count, data, null, total, false, compression); - } - - @Override - public void add(List others) { - if (others.size() == 0) { - return; - } - int size = lastUsedCell; - for (TDigest other : others) { - other.compress(); - size += other.centroidCount(); - } - - double[] m = new double[size]; - double[] w = new double[size]; - List> data; - if (recordAllData) { - data = new ArrayList<>(); - } else { - data = null; - } - int offset = 0; - for (TDigest other : others) { - if (other instanceof MergingDigest) { - MergingDigest md = (MergingDigest) other; - System.arraycopy(md.mean, 0, m, offset, md.lastUsedCell); - System.arraycopy(md.weight, 0, w, offset, md.lastUsedCell); - if (data != null) { - for (Centroid centroid : other.centroids()) { - data.add(centroid.data()); - } - } - offset += md.lastUsedCell; - } else { - for (Centroid centroid : other.centroids()) { - m[offset] = centroid.mean(); - w[offset] = centroid.count(); - if (recordAllData) { - assert data != null; - data.add(centroid.data()); - } - offset++; - } - } - } - add(m, w, size, data); - } - - private void mergeNewValues() { - mergeNewValues(false, compression); - } - - private void mergeNewValues(boolean force, double compression) { - if (totalWeight == 0 && unmergedWeight == 0) { - // seriously nothing to do - return; - } - if (force || unmergedWeight > 0) { - // note that we run the merge in reverse every other merge to avoid left-to-right bias in merging - merge(tempMean, tempWeight, tempUsed, tempData, order, unmergedWeight, - useAlternatingSort & mergeCount % 2 == 1, compression); - mergeCount++; - tempUsed = 0; - unmergedWeight = 0; - if (data != null) { - tempData = new ArrayList<>(); - } - } - } - - private void merge(double[] incomingMean, double[] incomingWeight, int incomingCount, - List> incomingData, int[] incomingOrder, - double unmergedWeight, boolean runBackwards, double compression) { - System.arraycopy(mean, 0, incomingMean, incomingCount, lastUsedCell); - System.arraycopy(weight, 0, 
incomingWeight, incomingCount, lastUsedCell); - incomingCount += lastUsedCell; - - if (incomingData != null) { - for (int i = 0; i < lastUsedCell; i++) { - assert data != null; - incomingData.add(data.get(i)); - } - data = new ArrayList<>(); - } - if (incomingOrder == null) { - incomingOrder = new int[incomingCount]; - } - Sort.sort(incomingOrder, incomingMean, incomingCount); - // option to run backwards is to investigate bias in errors - if (runBackwards) { - Sort.reverse(incomingOrder, 0, incomingCount); - } - - totalWeight += unmergedWeight; - - assert (lastUsedCell + incomingCount) > 0; - lastUsedCell = 0; - mean[lastUsedCell] = incomingMean[incomingOrder[0]]; - weight[lastUsedCell] = incomingWeight[incomingOrder[0]]; - double wSoFar = 0; - if (data != null) { - assert incomingData != null; - data.add(incomingData.get(incomingOrder[0])); - } - - - // weight will contain all zeros after this loop - - double normalizer = scale.normalizer(compression, totalWeight); - double k1 = scale.k(0, normalizer); - double wLimit = totalWeight * scale.q(k1 + 1, normalizer); - for (int i = 1; i < incomingCount; i++) { - int ix = incomingOrder[i]; - double proposedWeight = weight[lastUsedCell] + incomingWeight[ix]; - double projectedW = wSoFar + proposedWeight; - boolean addThis; - if (useWeightLimit) { - double q0 = wSoFar / totalWeight; - double q2 = (wSoFar + proposedWeight) / totalWeight; - addThis = proposedWeight <= totalWeight * Math.min(scale.max(q0, normalizer), scale.max(q2, normalizer)); - } else { - addThis = projectedW <= wLimit; - } - - if (addThis) { - // next point will fit - // so merge into existing centroid - weight[lastUsedCell] += incomingWeight[ix]; - mean[lastUsedCell] = mean[lastUsedCell] + (incomingMean[ix] - mean[lastUsedCell]) * incomingWeight[ix] / weight[lastUsedCell]; - incomingWeight[ix] = 0; - - if (data != null) { - while (data.size() <= lastUsedCell) { - data.add(new ArrayList()); - } - assert incomingData != null; - assert data.get(lastUsedCell) != incomingData.get(ix); - data.get(lastUsedCell).addAll(incomingData.get(ix)); - } - } else { - // didn't fit ... move to next output, copy out first centroid - wSoFar += weight[lastUsedCell]; - if (!useWeightLimit) { - k1 = scale.k(wSoFar / totalWeight, normalizer); - wLimit = totalWeight * scale.q(k1 + 1, normalizer); - } - - lastUsedCell++; - mean[lastUsedCell] = incomingMean[ix]; - weight[lastUsedCell] = incomingWeight[ix]; - incomingWeight[ix] = 0; - - if (data != null) { - assert incomingData != null; - assert data.size() == lastUsedCell; - data.add(incomingData.get(ix)); - } - } - } - // points to next empty cell - lastUsedCell++; - - // sanity check - double sum = 0; - for (int i = 0; i < lastUsedCell; i++) { - sum += weight[i]; - } - assert sum == totalWeight; - if (runBackwards) { - Sort.reverse(mean, 0, lastUsedCell); - Sort.reverse(weight, 0, lastUsedCell); - if (data != null) { - Collections.reverse(data); - } - } - - if (totalWeight > 0) { - min = Math.min(min, mean[0]); - max = Math.max(max, mean[lastUsedCell - 1]); - } - } - - /** - * Exposed for testing. 
- */ - int checkWeights() { - return checkWeights(weight, totalWeight, lastUsedCell); - } - - private int checkWeights(double[] w, double total, int last) { - int badCount = 0; - - int n = last; - if (w[n] > 0) { - n++; - } - - double normalizer = scale.normalizer(publicCompression, totalWeight); - double k1 = scale.k(0, normalizer); - double q = 0; - double left = 0; - String header = "\n"; - for (int i = 0; i < n; i++) { - double dq = w[i] / total; - double k2 = scale.k(q + dq, normalizer); - q += dq / 2; - if (k2 - k1 > 1 && w[i] != 1) { - System.out.printf("%sOversize centroid at " + - "%d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f, dq=%.4f, left=%.1f, current=%.2f maxw=%.2f\n", - header, i, k1, k2, k2 - k1, w[i], q, dq, left, w[i], totalWeight * scale.max(q, normalizer)); - header = ""; - badCount++; - } - if (k2 - k1 > 4 && w[i] != 1) { - throw new IllegalStateException( - String.format("Egregiously oversized centroid at " + - "%d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f, dq=%.4f, left=%.1f, current=%.2f, maxw=%.2f\n", - i, k1, k2, k2 - k1, w[i], q, dq, left, w[i], totalWeight * scale.max(q, normalizer))); - } - q += dq / 2; - left += w[i]; - k1 = k2; - } - - return badCount; - } - - /** - * Merges any pending inputs and compresses the data down to the public setting. - * Note that this typically loses a bit of precision and thus isn't a thing to - * be doing all the time. It is best done only when we want to show results to - * the outside world. - */ - @Override - public void compress() { - mergeNewValues(true, publicCompression); - } - - @Override - public long size() { - return (long) (totalWeight + unmergedWeight); - } - - @Override - public double cdf(double x) { - mergeNewValues(); - - if (lastUsedCell == 0) { - // no data to examine - return Double.NaN; - } else if (lastUsedCell == 1) { - // exactly one centroid, should have max==min - double width = max - min; - if (x < min) { - return 0; - } else if (x > max) { - return 1; - } else if (x - min <= width) { - // min and max are too close together to do any viable interpolation - return 0.5; - } else { - // interpolate if somehow we have weight > 0 and max != min - return (x - min) / (max - min); - } - } else { - int n = lastUsedCell; - if (x < min) { - return 0; - } - - if (x > max) { - return 1; - } - - // check for the left tail - if (x < mean[0]) { - // note that this is different than mean[0] > min - // ... 
this guarantees we divide by non-zero number and interpolation works - if (mean[0] - min > 0) { - // must be a sample exactly at min - if (x == min) { - return 0.5 / totalWeight; - } else { - return (1 + (x - min) / (mean[0] - min) * (weight[0] / 2 - 1)) / totalWeight; - } - } else { - // this should be redundant with the check x < min - return 0; - } - } - assert x >= mean[0]; - - // and the right tail - if (x > mean[n - 1]) { - if (max - mean[n - 1] > 0) { - if (x == max) { - return 1 - 0.5 / totalWeight; - } else { - // there has to be a single sample exactly at max - double dq = (1 + (max - x) / (max - mean[n - 1]) * (weight[n - 1] / 2 - 1)) / totalWeight; - return 1 - dq; - } - } else { - return 1; - } - } - - // we know that there are at least two centroids and mean[0] < x < mean[n-1] - // that means that there are either one or more consecutive centroids all at exactly x - // or there are consecutive centroids, c0 < x < c1 - double weightSoFar = 0; - for (int it = 0; it < n - 1; it++) { - // weightSoFar does not include weight[it] yet - if (mean[it] == x) { - // we have one or more centroids == x, treat them as one - // dw will accumulate the weight of all of the centroids at x - double dw = 0; - while (it < n && mean[it] == x) { - dw += weight[it]; - it++; - } - return (weightSoFar + dw / 2) / totalWeight; - } else if (mean[it] <= x && x < mean[it + 1]) { - // landed between centroids ... check for floating point madness - if (mean[it + 1] - mean[it] > 0) { - // note how we handle singleton centroids here - // the point is that for singleton centroids, we know that their entire - // weight is exactly at the centroid and thus shouldn't be involved in - // interpolation - double leftExcludedW = 0; - double rightExcludedW = 0; - if (weight[it] == 1) { - if (weight[it + 1] == 1) { - // two singletons means no interpolation - // left singleton is in, right is out - return (weightSoFar + 1) / totalWeight; - } else { - leftExcludedW = 0.5; - } - } else if (weight[it + 1] == 1) { - rightExcludedW = 0.5; - } - double dw = (weight[it] + weight[it + 1]) / 2; - - // can't have double singleton (handled that earlier) - assert dw > 1; - assert (leftExcludedW + rightExcludedW) <= 0.5; - - // adjust endpoints for any singleton - double left = mean[it]; - double right = mean[it + 1]; - - double dwNoSingleton = dw - leftExcludedW - rightExcludedW; - - // adjustments have only limited effect on endpoints - assert dwNoSingleton > dw / 2; - assert right - left > 0; - double base = weightSoFar + weight[it] / 2 + leftExcludedW; - return (base + dwNoSingleton * (x - left) / (right - left)) / totalWeight; - } else { - // this is simply caution against floating point madness - // it is conceivable that the centroids will be different - // but too near to allow safe interpolation - double dw = (weight[it] + weight[it + 1]) / 2; - return (weightSoFar + dw) / totalWeight; - } - } else { - weightSoFar += weight[it]; - } - } - if (x == mean[n - 1]) { - return 1 - 0.5 / totalWeight; - } else { - throw new IllegalStateException("Can't happen ... 
loop fell through"); - } - } - } - - @Override - public double quantile(double q) { - if (q < 0 || q > 1) { - throw new IllegalArgumentException("q should be in [0,1], got " + q); - } - mergeNewValues(); - - if (lastUsedCell == 0) { - // no centroids means no data, no way to get a quantile - return Double.NaN; - } else if (lastUsedCell == 1) { - // with one data point, all quantiles lead to Rome - return mean[0]; - } - - // we know that there are at least two centroids now - int n = lastUsedCell; - - // if values were stored in a sorted array, index would be the offset we are interested in - final double index = q * totalWeight; - - // beyond the boundaries, we return min or max - // usually, the first centroid will have unit weight so this will make it moot - if (index < 1) { - return min; - } - - // if the left centroid has more than one sample, we still know - // that one sample occurred at min so we can do some interpolation - if (weight[0] > 1 && index < weight[0] / 2) { - // there is a single sample at min so we interpolate with less weight - return min + (index - 1) / (weight[0] / 2 - 1) * (mean[0] - min); - } - - // usually the last centroid will have unit weight so this test will make it moot - if (index > totalWeight - 1) { - return max; - } - - // if the right-most centroid has more than one sample, we still know - // that one sample occurred at max so we can do some interpolation - if (weight[n-1] > 1 && totalWeight - index <= weight[n - 1] / 2) { - return max - (totalWeight - index - 1) / (weight[n - 1] / 2 - 1) * (max - mean[n - 1]); - } - - // in between extremes we interpolate between centroids - double weightSoFar = weight[0] / 2; - for (int i = 0; i < n - 1; i++) { - double dw = (weight[i] + weight[i + 1]) / 2; - if (weightSoFar + dw > index) { - // centroids i and i+1 bracket our current point - - // check for unit weight - double leftUnit = 0; - if (weight[i] == 1) { - if (index - weightSoFar < 0.5) { - // within the singleton's sphere - return mean[i]; - } else { - leftUnit = 0.5; - } - } - double rightUnit = 0; - if (weight[i + 1] == 1) { - if (weightSoFar + dw - index <= 0.5) { - // no interpolation needed near singleton - return mean[i + 1]; - } - rightUnit = 0.5; - } - double z1 = index - weightSoFar - leftUnit; - double z2 = weightSoFar + dw - index - rightUnit; - return weightedAverage(mean[i], z2, mean[i + 1], z1); - } - weightSoFar += dw; - } - // we handled singleton at end up above - assert weight[n - 1] > 1; - assert index <= totalWeight; - assert index >= totalWeight - weight[n - 1] / 2; - - // weightSoFar = totalWeight - weight[n-1]/2 (very nearly) - // so we interpolate out to max value ever seen - double z1 = index - totalWeight - weight[n - 1] / 2.0; - double z2 = weight[n - 1] / 2 - z1; - return weightedAverage(mean[n - 1], z1, max, z2); - } - - @Override - public int centroidCount() { - mergeNewValues(); - return lastUsedCell; - } - - @Override - public Collection centroids() { - // we don't actually keep centroid structures around so we have to fake it - compress(); - return new AbstractCollection() { - @Override - public Iterator iterator() { - return new Iterator() { - int i = 0; - - @Override - public boolean hasNext() { - return i < lastUsedCell; - } - - @Override - public Centroid next() { - Centroid rc = new Centroid(mean[i], (int) weight[i], data != null ? 
data.get(i) : null); - i++; - return rc; - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Default operation"); - } - }; - } - - @Override - public int size() { - return lastUsedCell; - } - }; - } - - @Override - public double compression() { - return publicCompression; - } - - @Override - public int byteSize() { - compress(); - // format code, compression(float), buffer-size(int), temp-size(int), #centroids-1(int), - // then two doubles per centroid - return lastUsedCell * 16 + 32; - } - - @Override - public int smallByteSize() { - compress(); - // format code(int), compression(float), buffer-size(short), temp-size(short), #centroids-1(short), - // then two floats per centroid - return lastUsedCell * 8 + 30; - } - - @SuppressWarnings("WeakerAccess") - public ScaleFunction getScaleFunction() { - return scale; - } - - public enum Encoding { - VERBOSE_ENCODING(1), SMALL_ENCODING(2); - - private final int code; - - Encoding(int code) { - this.code = code; - } - } - - @Override - public void asBytes(ByteBuffer buf) { - compress(); - buf.putInt(Encoding.VERBOSE_ENCODING.code); - buf.putDouble(min); - buf.putDouble(max); - buf.putDouble(publicCompression); - buf.putInt(lastUsedCell); - for (int i = 0; i < lastUsedCell; i++) { - buf.putDouble(weight[i]); - buf.putDouble(mean[i]); - } - } - - @Override - public void asSmallBytes(ByteBuffer buf) { - compress(); - buf.putInt(Encoding.SMALL_ENCODING.code); // 4 - buf.putDouble(min); // + 8 - buf.putDouble(max); // + 8 - buf.putFloat((float) publicCompression); // + 4 - buf.putShort((short) mean.length); // + 2 - buf.putShort((short) tempMean.length); // + 2 - buf.putShort((short) lastUsedCell); // + 2 = 30 - for (int i = 0; i < lastUsedCell; i++) { - buf.putFloat((float) weight[i]); - buf.putFloat((float) mean[i]); - } - } - - @SuppressWarnings("WeakerAccess") - public static MergingDigest fromBytes(ByteBuffer buf) { - int encoding = buf.getInt(); - if (encoding == Encoding.VERBOSE_ENCODING.code) { - double min = buf.getDouble(); - double max = buf.getDouble(); - double compression = buf.getDouble(); - int n = buf.getInt(); - MergingDigest r = new MergingDigest(compression); - r.setMinMax(min, max); - r.lastUsedCell = n; - for (int i = 0; i < n; i++) { - r.weight[i] = buf.getDouble(); - r.mean[i] = buf.getDouble(); - - r.totalWeight += r.weight[i]; - } - return r; - } else if (encoding == Encoding.SMALL_ENCODING.code) { - double min = buf.getDouble(); - double max = buf.getDouble(); - double compression = buf.getFloat(); - int n = buf.getShort(); - int bufferSize = buf.getShort(); - MergingDigest r = new MergingDigest(compression, bufferSize, n); - r.setMinMax(min, max); - r.lastUsedCell = buf.getShort(); - for (int i = 0; i < r.lastUsedCell; i++) { - r.weight[i] = buf.getFloat(); - r.mean[i] = buf.getFloat(); - - r.totalWeight += r.weight[i]; - } - return r; - } else { - throw new IllegalStateException("Invalid format for serialized histogram"); - } - - } - - @Override - public String toString() { - return "MergingDigest" - + "-" + getScaleFunction() - + "-" + (useWeightLimit ? "weight" : "kSize") - + "-" + (useAlternatingSort ? "alternating" : "stable") - + "-" + (useTwoLevelCompression ? 
"twoLevel" : "oneLevel"); - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java b/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java deleted file mode 100644 index ad586586078f9..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/tdigest/ScaleFunction.java +++ /dev/null @@ -1,617 +0,0 @@ -/* - * Licensed to Ted Dunning under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.tdigest; - -/** - * Encodes the various scale functions for t-digests. These limits trade accuracy near the tails against accuracy near - * the median in different ways. For instance, K_0 has uniform cluster sizes and results in constant accuracy (in terms - * of q) while K_3 has cluster sizes proportional to min(q,1-q) which results in very much smaller error near the tails - * and modestly increased error near the median. - *

- * The base forms (K_0, K_1, K_2 and K_3) all result in t-digests limited to a number of clusters equal to the - * compression factor. The K_2_NO_NORM and K_3_NO_NORM versions result in the cluster count increasing roughly with - * log(n). - */ -public enum ScaleFunction { - /** - * Generates uniform cluster sizes. Used for comparison only. - */ - K_0 { - @Override - public double k(double q, double compression, double n) { - return compression * q / 2; - } - - @Override - public double k(double q, double normalizer) { - return normalizer * q; - } - - @Override - public double q(double k, double compression, double n) { - return 2 * k / compression; - } - - @Override - public double q(double k, double normalizer) { - return k / normalizer; - } - - @Override - public double max(double q, double compression, double n) { - return 2 / compression; - } - - @Override - public double max(double q, double normalizer) { - return 1 / normalizer; - } - - @Override - public double normalizer(double compression, double n) { - return compression / 2; - } - }, - - /** - * Generates cluster sizes proportional to sqrt(q*(1-q)). This gives constant relative accuracy if accuracy is - * proportional to squared cluster size. It is expected that K_2 and K_3 will give better practical results. - */ - K_1 { - @Override - public double k(double q, double compression, double n) { - return compression * Math.asin(2 * q - 1) / (2 * Math.PI); - } - - @Override - public double k(double q, double normalizer) { - return normalizer * Math.asin(2 * q - 1); - } - - - @Override - public double q(double k, double compression, double n) { - return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2; - } - - @Override - public double q(double k, double normalizer) { - return (Math.sin(k / normalizer) + 1) / 2; - } - - @Override - public double max(double q, double compression, double n) { - if (q <= 0) { - return 0; - } else if (q >= 1) { - return 0; - } else { - return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q)); - } - } - - @Override - public double max(double q, double normalizer) { - if (q <= 0) { - return 0; - } else if (q >= 1) { - return 0; - } else { - return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q)); - } - } - - @Override - public double normalizer(double compression, double n) { - return compression / (2 * Math.PI); - } - }, - - /** - * Generates cluster sizes proportional to sqrt(q*(1-q)) but avoids computation of asin in the critical path by - * using an approximate version. 
- */ - K_1_FAST { - @Override - public double k(double q, double compression, double n) { - return compression * fastAsin(2 * q - 1) / (2 * Math.PI); - } - - @Override - public double k(double q, double normalizer) { - return normalizer * fastAsin(2 * q - 1); - } - - @Override - public double q(double k, double compression, double n) { - return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2; - } - - @Override - public double q(double k, double normalizer) { - return (Math.sin(k / normalizer) + 1) / 2; - } - - @Override - public double max(double q, double compression, double n) { - if (q <= 0) { - return 0; - } else if (q >= 1) { - return 0; - } else { - return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q)); - } - } - - @Override - public double max(double q, double normalizer) { - if (q <= 0) { - return 0; - } else if (q >= 1) { - return 0; - } else { - return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q)); - } - } - - @Override - public double normalizer(double compression, double n) { - return compression / (2 * Math.PI); - } - }, - - /** - * Generates cluster sizes proportional to q*(1-q). This makes tail error bounds tighter than for K_1. The use of a - * normalizing function results in a strictly bounded number of clusters no matter how many samples. - */ - K_2 { - @Override - public double k(double q, double compression, double n) { - if (n <= 1) { - if (q <= 0) { - return -10; - } else if (q >= 1) { - return 10; - } else { - return 0; - } - } - if (q == 0) { - return 2 * k(1 / n, compression, n); - } else if (q == 1) { - return 2 * k((n - 1) / n, compression, n); - } else { - return compression * Math.log(q / (1 - q)) / Z(compression, n); - } - } - - @Override - public double k(double q, double normalizer) { - if (q < 1e-15) { - // this will return something more extreme than q = 1/n - return 2 * k(1e-15, normalizer); - } else if (q > 1 - 1e-15) { - // this will return something more extreme than q = (n-1)/n - return 2 * k(1 - 1e-15, normalizer); - } else { - return Math.log(q / (1 - q)) * normalizer; - } - } - - @Override - public double q(double k, double compression, double n) { - double w = Math.exp(k * Z(compression, n) / compression); - return w / (1 + w); - } - - @Override - public double q(double k, double normalizer) { - double w = Math.exp(k / normalizer); - return w / (1 + w); - } - - @Override - public double max(double q, double compression, double n) { - return Z(compression, n) * q * (1 - q) / compression; - } - - @Override - public double max(double q, double normalizer) { - return q * (1 - q) / normalizer; - } - - @Override - public double normalizer(double compression, double n) { - return compression / Z(compression, n); - } - - private double Z(double compression, double n) { - return 4 * Math.log(n / compression) + 24; - } - }, - - /** - * Generates cluster sizes proportional to min(q, 1-q). This makes tail error bounds tighter than for K_1 or K_2. - * The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples. 
- */ - K_3 { - @Override - public double k(double q, double compression, double n) { - if (q < 0.9 / n) { - return 10 * k(1 / n, compression, n); - } else if (q > 1 - 0.9 / n) { - return 10 * k((n - 1) / n, compression, n); - } else { - if (q <= 0.5) { - return compression * Math.log(2 * q) / Z(compression, n); - } else { - return -k(1 - q, compression, n); - } - } - } - - @Override - public double k(double q, double normalizer) { - if (q < 1e-15) { - return 10 * k(1e-15, normalizer); - } else if (q > 1 - 1e-15) { - return 10 * k(1 - 1e-15, normalizer); - } else { - if (q <= 0.5) { - return Math.log(2 * q) / normalizer; - } else { - return -k(1 - q, normalizer); - } - } - } - - @Override - public double q(double k, double compression, double n) { - if (k <= 0) { - return Math.exp(k * Z(compression, n) / compression) / 2; - } else { - return 1 - q(-k, compression, n); - } - } - - @Override - public double q(double k, double normalizer) { - if (k <= 0) { - return Math.exp(k / normalizer) / 2; - } else { - return 1 - q(-k, normalizer); - } - } - - @Override - public double max(double q, double compression, double n) { - return Z(compression, n) * Math.min(q, 1 - q) / compression; - } - - @Override - public double max(double q, double normalizer) { - return Math.min(q, 1 - q) / normalizer; - } - - @Override - public double normalizer(double compression, double n) { - return compression / Z(compression, n); - } - - private double Z(double compression, double n) { - return 4 * Math.log(n / compression) + 21; - } - }, - - /** - * Generates cluster sizes proportional to q*(1-q). This makes the tail error bounds tighter. This version does not - * use a normalizer function and thus the number of clusters increases roughly proportional to log(n). That is good - * for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for - * tree-based implementations. - */ - K_2_NO_NORM { - @Override - public double k(double q, double compression, double n) { - if (q == 0) { - return 2 * k(1 / n, compression, n); - } else if (q == 1) { - return 2 * k((n - 1) / n, compression, n); - } else { - return compression * Math.log(q / (1 - q)); - } - } - - @Override - public double k(double q, double normalizer) { - if (q <= 1e-15) { - return 2 * k(1e-15, normalizer); - } else if (q >= 1 - 1e-15) { - return 2 * k(1 - 1e-15, normalizer); - } else { - return normalizer * Math.log(q / (1 - q)); - } - } - - @Override - public double q(double k, double compression, double n) { - double w = Math.exp(k / compression); - return w / (1 + w); - } - - @Override - public double q(double k, double normalizer) { - double w = Math.exp(k / normalizer); - return w / (1 + w); - } - - @Override - public double max(double q, double compression, double n) { - return q * (1 - q) / compression; - } - - @Override - public double max(double q, double normalizer) { - return q * (1 - q) / normalizer; - } - - @Override - public double normalizer(double compression, double n) { - return compression; - } - }, - - /** - * Generates cluster sizes proportional to min(q, 1-q). This makes the tail error bounds tighter. This version does - * not use a normalizer function and thus the number of clusters increases roughly proportional to log(n). That is - * good for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for - * tree-based implementations. 
- */ - K_3_NO_NORM { - @Override - public double k(double q, double compression, double n) { - if (q < 0.9 / n) { - return 10 * k(1 / n, compression, n); - } else if (q > 1 - 0.9 / n) { - return 10 * k((n - 1) / n, compression, n); - } else { - if (q <= 0.5) { - return compression * Math.log(2 * q); - } else { - return -k(1 - q, compression, n); - } - } - } - - @Override - public double k(double q, double normalizer) { - if (q <= 1e-15) { - return 10 * k(1e-15, normalizer); - } else if (q > 1 - 1e-15) { - return 10 * k(1 - 1e-15, normalizer); - } else { - if (q <= 0.5) { - return normalizer * Math.log(2 * q); - } else { - return -k(1 - q, normalizer); - } - } - } - - @Override - public double q(double k, double compression, double n) { - if (k <= 0) { - return Math.exp(k / compression) / 2; - } else { - return 1 - q(-k, compression, n); - } - } - - @Override - public double q(double k, double normalizer) { - if (k <= 0) { - return Math.exp(k / normalizer) / 2; - } else { - return 1 - q(-k, normalizer); - } - } - - @Override - public double max(double q, double compression, double n) { - return Math.min(q, 1 - q) / compression; - } - - @Override - public double max(double q, double normalizer) { - return Math.min(q, 1 - q) / normalizer; - } - - @Override - public double normalizer(double compression, double n) { - return compression; - } - }; // max weight is min(q,1-q), should improve tail accuracy even more - - /** - * Converts a quantile to the k-scale. The total number of points is also provided so that a normalizing function - * can be computed if necessary. - * - * @param q The quantile - * @param compression Also known as delta in literature on the t-digest - * @param n The total number of samples - * @return The corresponding value of k - */ - abstract public double k(double q, double compression, double n); - - /** - * Converts a quantile to the k-scale. The normalizer value depends on compression and (possibly) number of points - * in the digest. #normalizer(double, double) - * - * @param q The quantile - * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the - * digest. - * @return The corresponding value of k - */ - abstract public double k(double q, double normalizer); - - /** - * Computes q as a function of k. This is often faster than finding k as a function of q for some scales. - * - * @param k The index value to convert into q scale. - * @param compression The compression factor (often written as δ) - * @param n The number of samples already in the digest. - * @return The value of q that corresponds to k - */ - abstract public double q(double k, double compression, double n); - - /** - * Computes q as a function of k. This is often faster than finding k as a function of q for some scales. - * - * @param k The index value to convert into q scale. - * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the - * digest. - * @return The value of q that corresponds to k - */ - abstract public double q(double k, double normalizer); - - /** - * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range - * spanned by a cluster that q should be isn't clear. That means that this function usually has to be taken at - * multiple points and the smallest value used. - *

- * Note that this is the relative size of a cluster. To get the max number of samples in the cluster, multiply this - * value times the total number of samples in the digest. - * - * @param q The quantile - * @param compression The compression factor, typically delta in the literature - * @param n The number of samples seen so far in the digest - * @return The maximum number of samples that can be in the cluster - */ - abstract public double max(double q, double compression, double n); - - /** - * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range - * spanned by a cluster that q should be isn't clear. That means that this function usually has to be taken at - * multiple points and the smallest value used. - *

- * Note that this is the relative size of a cluster. To get the max number of samples in the cluster, multiply this - * value times the total number of samples in the digest. - * - * @param q The quantile - * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the - * digest. - * @return The maximum number of samples that can be in the cluster - */ - abstract public double max(double q, double normalizer); - - /** - * Computes the normalizer given compression and number of points. - */ - abstract public double normalizer(double compression, double n); - - /** - * Approximates asin to within about 1e-6. This approximation works by breaking the range from 0 to 1 into 5 regions - * for all but the region nearest 1, rational polynomial models get us a very good approximation of asin and by - * interpolating as we move from region to region, we can guarantee continuity and we happen to get monotonicity as - * well. for the values near 1, we just use Math.asin as our region "approximation". - * - * @param x sin(theta) - * @return theta - */ - static double fastAsin(double x) { - if (x < 0) { - return -fastAsin(-x); - } else if (x > 1) { - return Double.NaN; - } else { - // Cutoffs for models. Note that the ranges overlap. In the - // overlap we do linear interpolation to guarantee the overall - // result is "nice" - double c0High = 0.1; - double c1High = 0.55; - double c2Low = 0.5; - double c2High = 0.8; - double c3Low = 0.75; - double c3High = 0.9; - double c4Low = 0.87; - if (x > c3High) { - return Math.asin(x); - } else { - // the models - double[] m0 = {0.2955302411, 1.2221903614, 0.1488583743, 0.2422015816, -0.3688700895, 0.0733398445}; - double[] m1 = {-0.0430991920, 0.9594035750, -0.0362312299, 0.1204623351, 0.0457029620, -0.0026025285}; - double[] m2 = {-0.034873933724, 1.054796752703, -0.194127063385, 0.283963735636, 0.023800124916, -0.000872727381}; - double[] m3 = {-0.37588391875, 2.61991859025, -2.48835406886, 1.48605387425, 0.00857627492, -0.00015802871}; - - // the parameters for all of the models - double[] vars = {1, x, x * x, x * x * x, 1 / (1 - x), 1 / (1 - x) / (1 - x)}; - - // raw grist for interpolation coefficients - double x0 = bound((c0High - x) / c0High); - double x1 = bound((c1High - x) / (c1High - c2Low)); - double x2 = bound((c2High - x) / (c2High - c3Low)); - double x3 = bound((c3High - x) / (c3High - c4Low)); - - // interpolation coefficients - //noinspection UnnecessaryLocalVariable - double mix0 = x0; - double mix1 = (1 - x0) * x1; - double mix2 = (1 - x1) * x2; - double mix3 = (1 - x2) * x3; - double mix4 = 1 - x3; - - // now mix all the results together, avoiding extra evaluations - double r = 0; - if (mix0 > 0) { - r += mix0 * eval(m0, vars); - } - if (mix1 > 0) { - r += mix1 * eval(m1, vars); - } - if (mix2 > 0) { - r += mix2 * eval(m2, vars); - } - if (mix3 > 0) { - r += mix3 * eval(m3, vars); - } - if (mix4 > 0) { - // model 4 is just the real deal - r += mix4 * Math.asin(x); - } - return r; - } - } - } - - private static double eval(double[] model, double[] vars) { - double r = 0; - for (int i = 0; i < model.length; i++) { - r += model[i] * vars[i]; - } - return r; - } - - private static double bound(double v) { - if (v <= 0) { - return 0; - } else if (v >= 1) { - return 1; - } else { - return v; - } - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java index 467e82acda8ee..ce0ca747f2fa7 100644 --- 
a/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigest.java @@ -1,3 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Licensed to Ted Dunning under one or more * contributor license agreements. See the NOTICE file distributed with @@ -17,210 +31,623 @@ package com.facebook.presto.tdigest; -import java.io.Serializable; -import java.nio.ByteBuffer; +import io.airlift.slice.BasicSliceInput; +import io.airlift.slice.DynamicSliceOutput; +import io.airlift.slice.Slice; +import io.airlift.slice.SliceInput; +import io.airlift.slice.SliceOutput; +import org.openjdk.jol.info.ClassLayout; + +import javax.annotation.concurrent.NotThreadSafe; + +import java.util.AbstractCollection; +import java.util.ArrayList; import java.util.Collection; +import java.util.Iterator; import java.util.List; +import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; + +import static com.facebook.presto.tdigest.TDigestUtils.maxSize; +import static com.facebook.presto.tdigest.TDigestUtils.normalizer; +import static com.facebook.presto.tdigest.TDigestUtils.reverse; +import static com.facebook.presto.tdigest.TDigestUtils.sort; +import static com.facebook.presto.tdigest.TDigestUtils.weightedAverage; +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; +import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.sizeOf; +import static io.airlift.slice.Slices.wrappedDoubleArray; +import static java.lang.Math.ceil; +import static java.lang.Math.max; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.lang.System.arraycopy; +import static java.util.Collections.shuffle; /** - * Adaptive histogram based on something like streaming k-means crossed with Q-digest. - * - * The special characteristics of this algorithm are: - * - * - smaller summaries than Q-digest - * - * - works on doubles as well as integers. - * - * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy for middle quantiles - * - * - fast - * - * - simple - * - * - test coverage roughly at 90% - * - * - easy to adapt for use with map-reduce + * Maintains a t-digest by collecting new points in a buffer that is then sorted occasionally and merged + * into a sorted array that contains previously computed centroids. + *

+ * This can be very fast because the cost of sorting and merging is amortized over several insertions. If
+ * we keep N centroids total and have the input array of size k, then the amortized cost is approximately
+ *
+ * N/k + log k
+ *
+ * These costs even out when N/k = log k. Balancing costs is often a good place to start in optimizing an
+ * algorithm.
+ * The virtues of this kind of t-digest implementation include:
+ * <ul>
+ * <li>No allocation is required after initialization</li>
+ * <li>The data structure automatically compresses existing centroids when possible</li>
+ * <li>No Java object overhead is incurred for centroids since data is kept in primitive arrays</li>
+ * </ul>
*/ -public abstract class TDigest implements Serializable { - protected ScaleFunction scale = ScaleFunction.K_2; - double min = Double.POSITIVE_INFINITY; - double max = Double.NEGATIVE_INFINITY; - /** - * Creates an {@link MergingDigest}. This is generally the best known implementation right now. - * - * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large. - * The number of centroids retained will be a smallish (usually less than 10) multiple of this number. - * @return the MergingDigest - */ - @SuppressWarnings("WeakerAccess") - public static TDigest createMergingDigest(double compression) { - return new MergingDigest(compression); +@NotThreadSafe +public class TDigest +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(TDigest.class).instanceSize(); + private static final double MAX_COMPRESSION_FACTOR = 1_000; + private static final double sizeFudge = 30; + + private double min = Double.POSITIVE_INFINITY; + private double max = Double.NEGATIVE_INFINITY; + + private final Random gen = ThreadLocalRandom.current(); + + private int mergeCount; + + private final double publicCompression; + private final double compression; + + // points to the first unused centroid + private int activeCentroids; + + private double totalWeight; + + private final double[] weight; + + private final double[] mean; + + private double unmergedWeight; + + // this is the index of the next temporary centroid + // this is a more Java-like convention than activeCentroids uses + private int tempUsed; + private final double[] tempWeight; + private final double[] tempMean; + + // array used for sorting the temp centroids + // to avoid allocations during operation + private final int[] order; + + private TDigest(double compression) + { + // ensure compression >= 10 + // default size = 2 * ceil(compression) + // default bufferSize = 5 * size + // ensure size > 2 * compression + weightLimitFudge + // ensure bufferSize > 2 * size + + checkArgument(compression <= MAX_COMPRESSION_FACTOR, "Compression factor cannot exceed %s", MAX_COMPRESSION_FACTOR); + this.publicCompression = max(compression, 10); + // publicCompression is how many centroids the user asked for + // compression is how many we actually keep + this.compression = 2 * publicCompression; + + // having a big buffer is good for speed + int bufferSize = 5 * (int) ceil(this.publicCompression + sizeFudge); + int size = (int) ceil(this.compression + sizeFudge); + + weight = new double[size]; + mean = new double[size]; + tempWeight = new double[bufferSize]; + tempMean = new double[bufferSize]; + order = new int[bufferSize]; + + activeCentroids = 0; } - /** - * Creates a TDigest of whichever type is the currently recommended type. MergingDigest is generally the best - * known implementation right now. - * - * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large. - * The number of centroids retained will be a smallish (usually less than 10) multiple of this number. - * @return the TDigest - */ - @SuppressWarnings({"unused", "WeakerAccess", "SameParameterValue"}) - public static TDigest createDigest(double compression) { - return createMergingDigest(compression); + public static TDigest createTDigest(double compression) + { + return new TDigest(compression); } - /** - * Adds a sample to a histogram. - * - * @param x The value to add. - * @param w The weight of this point. 
- */ - public abstract void add(double x, int w); + public static TDigest createTDigest(Slice slice) + { + if (slice == null) { + return null; + } - final void checkValue(double x) { - if (Double.isNaN(x)) { - throw new IllegalArgumentException("Cannot add NaN"); + SliceInput sliceInput = new BasicSliceInput(slice); + try { + byte format = sliceInput.readByte(); + checkArgument(format == 0, "Invalid serialization format for TDigest; expected '0'"); + byte type = sliceInput.readByte(); + checkArgument(type == 0, "Invalid type for TDigest; expected '0' (type double)"); + double min = sliceInput.readDouble(); + double max = sliceInput.readDouble(); + double publicCompression = max(10, sliceInput.readDouble()); + TDigest r = new TDigest(publicCompression); + r.setMinMax(min, max); + r.totalWeight = sliceInput.readDouble(); + r.activeCentroids = sliceInput.readInt(); + sliceInput.readBytes(wrappedDoubleArray(r.weight), r.activeCentroids * SIZE_OF_DOUBLE); + sliceInput.readBytes(wrappedDoubleArray(r.mean), r.activeCentroids * SIZE_OF_DOUBLE); + sliceInput.close(); + return r; + } + catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException("Incorrect slice serialization format"); } } - public abstract void add(List others); + public void add(double x) + { + add(x, 1); + } - /** - * Re-examines a t-digest to determine whether some centroids are redundant. If your data are - * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space. - * - * The cost is roughly the same as adding as many data points as there are centroids. This - * is typically < 10 * compression, but could be as high as 100 * compression. - * - * This is a destructive operation that is not thread-safe. - */ - public abstract void compress(); + public void add(double x, long w) + { + checkArgument(!Double.isNaN(x), "Cannot add NaN to t-digest"); + checkArgument(w > 0L, "weight must be > 0"); - /** - * Returns the number of points that have been added to this TDigest. - * - * @return The sum of the weights on all centroids. - */ - public abstract long size(); + if (tempUsed >= tempWeight.length - activeCentroids - 1) { + mergeNewValues(); + } + int where = tempUsed++; + tempWeight[where] = w; + tempMean[where] = x; + unmergedWeight += w; + if (x < min) { + min = x; + } + if (x > max) { + max = x; + } + } - /** - * Returns the fraction of all points added which are ≤ x. - * - * @param x The cutoff for the cdf. - * @return The fraction of all data which is less or equal to x. - */ - public abstract double cdf(double x); + public void merge(TDigest other) + { + checkArgument(other != null, "Cannot merge with a null t-digest"); + checkArgument(this.publicCompression == other.getCompressionFactor(), "TDigests must have the same compression, found (%s, %s)", this.publicCompression, + other.getCompressionFactor()); + List tmp = new ArrayList<>(); + for (Centroid centroid : other.centroids()) { + tmp.add(centroid); + } - /** - * Returns an estimate of the cutoff such that a specified fraction of the data - * added to this TDigest would be less than or equal to the cutoff. - * - * @param q The desired fraction - * @return The value x such that cdf(x) == q - */ - public abstract double quantile(double q); + shuffle(tmp, gen); + for (Centroid centroid : tmp) { + add(centroid.getMean(), centroid.getWeight()); + } + } - /** - * A {@link Collection} that lets you go through the centroids in ascending order by mean. 
Centroids - * returned will not be re-used, but may or may not share storage with this TDigest. - * - * @return The centroids in the form of a Collection. - */ - public abstract Collection centroids(); + private void mergeNewValues() + { + mergeNewValues(false, compression); + } - /** - * Returns the current compression factor. - * - * @return The compression factor originally used to set up the TDigest. - */ - public abstract double compression(); + private void mergeNewValues(boolean force, double compression) + { + if (unmergedWeight == 0) { + return; + } - /** - * Returns the number of bytes required to encode this TDigest using #asBytes(). - * - * @return The number of bytes required. - */ - public abstract int byteSize(); + if (force || unmergedWeight > 0) { + // note that we run the merge in reverse every other merge to avoid left-to-right bias in merging + merge(tempMean, tempWeight, tempUsed, order, unmergedWeight, mergeCount % 2 == 1, compression); + mergeCount++; + tempUsed = 0; + unmergedWeight = 0; + } + } - /** - * Returns the number of bytes required to encode this TDigest using #asSmallBytes(). - * - * Note that this is just as expensive as actually compressing the digest. If you don't - * care about time, but want to never over-allocate, this is fine. If you care about compression - * and speed, you pretty much just have to overallocate by using allocating #byteSize() bytes. - * - * @return The number of bytes required. - */ - public abstract int smallByteSize(); + private void merge(double[] incomingMean, + double[] incomingWeight, + int incomingCount, + int[] incomingOrder, + double unmergedWeight, + boolean runBackwards, + double compression) + { + arraycopy(mean, 0, incomingMean, incomingCount, activeCentroids); + arraycopy(weight, 0, incomingWeight, incomingCount, activeCentroids); + incomingCount += activeCentroids; + + if (incomingOrder == null) { + incomingOrder = new int[incomingCount]; + } + sort(incomingOrder, incomingMean, incomingCount); + + if (runBackwards) { + reverse(incomingOrder, 0, incomingCount); + } + + totalWeight += unmergedWeight; + + checkArgument((activeCentroids + incomingCount) > 0, "Active centroids plus incoming count must be > 0, was %s", activeCentroids + incomingCount); + activeCentroids = 0; + mean[activeCentroids] = incomingMean[incomingOrder[0]]; + weight[activeCentroids] = incomingWeight[incomingOrder[0]]; + double weightSoFar = 0; - public void setScaleFunction(ScaleFunction scaleFunction) { - if (scaleFunction.toString().endsWith("NO_NORM")) { - throw new IllegalArgumentException( - String.format("Can't use %s as scale with %s", scaleFunction, this.getClass())); + double normalizer = normalizer(compression, totalWeight); + for (int i = 1; i < incomingCount; i++) { + int ix = incomingOrder[i]; + double proposedWeight = weight[activeCentroids] + incomingWeight[ix]; + boolean addThis; + + double q0 = weightSoFar / totalWeight; + double q2 = (weightSoFar + proposedWeight) / totalWeight; + addThis = proposedWeight <= totalWeight * Math.min(maxSize(q0, normalizer), maxSize(q2, normalizer)); + + if (addThis) { + // next point can be merged into existing centroid + weight[activeCentroids] += incomingWeight[ix]; + mean[activeCentroids] = mean[activeCentroids] + (incomingMean[ix] - mean[activeCentroids]) * incomingWeight[ix] / weight[activeCentroids]; + incomingWeight[ix] = 0; + } + else { + // move to next output, copy out first centroid + weightSoFar += weight[activeCentroids]; + + activeCentroids++; + mean[activeCentroids] = 
incomingMean[ix]; + weight[activeCentroids] = incomingWeight[ix]; + incomingWeight[ix] = 0; + } + } + activeCentroids++; + + // sanity check + double sum = 0; + for (int i = 0; i < activeCentroids; i++) { + sum += weight[i]; + } + + checkArgument(sum == totalWeight, "Sum must equal the total weight, but sum:%s != totalWeight:%s", sum, totalWeight); + if (runBackwards) { + reverse(mean, 0, activeCentroids); + reverse(weight, 0, activeCentroids); + } + + if (totalWeight > 0) { + min = Math.min(min, mean[0]); + max = max(max, mean[activeCentroids - 1]); } - this.scale = scaleFunction; } /** - * Serialize this TDigest into a byte buffer. Note that the serialization used is - * very straightforward and is considerably larger than strictly necessary. - * - * @param buf The byte buffer into which the TDigest should be serialized. + * Merges any pending inputs and compresses the data down to the public setting. */ - public abstract void asBytes(ByteBuffer buf); + public void compress() + { + mergeNewValues(true, publicCompression); + } - /** - * Serialize this TDigest into a byte buffer. Some simple compression is used - * such as using variable byte representation to store the centroid weights and - * using delta-encoding on the centroid means so that floats can be reasonably - * used to store the centroid means. - * - * @param buf The byte buffer into which the TDigest should be serialized. - */ - public abstract void asSmallBytes(ByteBuffer buf); + public double getSize() + { + return totalWeight + unmergedWeight; + } - /** - * Tell this TDigest to record the original data as much as possible for test - * purposes. - * - * @return This TDigest so that configurations can be done in fluent style. - */ - public abstract TDigest recordAllData(); + public double getCdf(double x) + { + if (unmergedWeight > 0) { + compress(); + } - public abstract boolean isRecording(); + if (activeCentroids == 0) { + return Double.NaN; + } + if (activeCentroids == 1) { + double width = max - min; + if (x < min) { + return 0; + } + if (x > max) { + return 1; + } + if (x - min <= width) { + // min and max are too close together to do any viable interpolation + return 0.5; + } + return (x - min) / (max - min); + } + int n = activeCentroids; + if (x < min) { + return 0; + } - /** - * Add a sample to this TDigest. - * - * @param x The data value to add - */ - public abstract void add(double x); + if (x > max) { + return 1; + } - /** - * Add all of the centroids of another TDigest to this one. 
- * - * @param other The other TDigest - */ - public abstract void add(TDigest other); + // check for the left tail + if (x < mean[0]) { + // guarantees we divide by non-zero number and interpolation works + if (mean[0] - min > 0) { + // must be a sample exactly at min + if (x == min) { + return 0.5 / totalWeight; + } + return (1 + (x - min) / (mean[0] - min) * (weight[0] / 2 - 1)) / totalWeight; + } + return 0; + } + checkArgument(x >= mean[0], "Value x:%s must be greater than mean of first centroid %s if we got here", x, mean[0]); - public abstract int centroidCount(); + // and the right tail + if (x > mean[n - 1]) { + if (max - mean[n - 1] > 0) { + if (x == max) { + return 1 - 0.5 / totalWeight; + } + // there has to be a single sample exactly at max + double dq = (1 + (max - x) / (max - mean[n - 1]) * (weight[n - 1] / 2 - 1)) / totalWeight; + return 1 - dq; + } + return 1; + } - public double getMin() { - return min; + // we know that there are at least two centroids and mean[0] < x < mean[n-1] + // that means that there are either one or more consecutive centroids all at exactly x + // or there are consecutive centroids, c0 < x < c1 + double weightSoFar = 0; + for (int it = 0; it < n - 1; it++) { + // weightSoFar does not include weight[it] yet + if (mean[it] == x) { + // dw will accumulate the weight of all of the centroids at x + double dw = 0; + while (it < n && mean[it] == x) { + dw += weight[it]; + it++; + } + return (weightSoFar + dw / 2) / totalWeight; + } + else if (mean[it] <= x && x < mean[it + 1]) { + // landed between centroids + if (mean[it + 1] - mean[it] > 0) { + // no interpolation needed if we have a singleton centroid + double leftExcludedW = 0; + double rightExcludedW = 0; + if (weight[it] == 1) { + if (weight[it + 1] == 1) { + // two singletons means no interpolation + // left singleton is in, right is out + return (weightSoFar + 1) / totalWeight; + } + else { + leftExcludedW = 0.5; + } + } + else if (weight[it + 1] == 1) { + rightExcludedW = 0.5; + } + double dw = (weight[it] + weight[it + 1]) / 2; + + checkArgument(dw > 1, "dw must be > 1, was %s", dw); + checkArgument((leftExcludedW + rightExcludedW) <= 0.5, "Excluded weight must be <= 0.5, was %s", leftExcludedW + rightExcludedW); + + // adjust endpoints for any singleton + double left = mean[it]; + double right = mean[it + 1]; + + double dwNoSingleton = dw - leftExcludedW - rightExcludedW; + + checkArgument(right - left > 0, "Centroids should be in ascending order, but mean of left centroid was greater than right centroid"); + + double base = weightSoFar + weight[it] / 2 + leftExcludedW; + return (base + dwNoSingleton * (x - left) / (right - left)) / totalWeight; + } + else { + // caution against floating point madness + double dw = (weight[it] + weight[it + 1]) / 2; + return (weightSoFar + dw) / totalWeight; + } + } + else { + weightSoFar += weight[it]; + } + } + checkArgument(x == mean[n - 1], "At this point, x must equal the mean of the last centroid"); + + return 1 - 0.5 / totalWeight; } - public double getMax() { - return max; + public double getQuantile(double q) + { + checkArgument(q >= 0 && q <= 1, "q should be in [0,1], got %s", q); + if (unmergedWeight > 0) { + compress(); + } + + if (activeCentroids == 0) { + return Double.NaN; + } + else if (activeCentroids == 1) { + return mean[0]; + } + + int n = activeCentroids; + + final double index = q * totalWeight; + + if (index < 1) { + return min; + } + + // if the left centroid has more than one sample, we still know + // that one sample occurred at min 
so we can do some interpolation + if (weight[0] > 1 && index < weight[0] / 2) { + // there is a single sample at min so we interpolate with less weight + return min + (index - 1) / (weight[0] / 2 - 1) * (mean[0] - min); + } + + if (index > totalWeight - 1) { + return max; + } + + // if the right-most centroid has more than one sample, we still know + // that one sample occurred at max so we can do some interpolation + if (weight[n - 1] > 1 && totalWeight - index <= weight[n - 1] / 2) { + return max - (totalWeight - index - 1) / (weight[n - 1] / 2 - 1) * (max - mean[n - 1]); + } + + // in between extremes we interpolate between centroids + double weightSoFar = weight[0] / 2; + for (int i = 0; i < n - 1; i++) { + // centroids i and i + 1 bracket our current point + double dw = (weight[i] + weight[i + 1]) / 2; + if (weightSoFar + dw > index) { + // check for unit weight + double leftUnit = 0; + if (weight[i] == 1) { + if (index - weightSoFar < 0.5) { + // within the singleton's sphere + return mean[i]; + } + else { + leftUnit = 0.5; + } + } + double rightUnit = 0; + if (weight[i + 1] == 1) { + if (weightSoFar + dw - index <= 0.5) { + // no interpolation needed near singleton + return mean[i + 1]; + } + rightUnit = 0.5; + } + double z1 = index - weightSoFar - leftUnit; + double z2 = weightSoFar + dw - index - rightUnit; + return weightedAverage(mean[i], z2, mean[i + 1], z1); + } + weightSoFar += dw; + } + + checkArgument(weight[n - 1] > 1, "Expected weight[n - 1] > 1, but was %s", weight[n - 1]); + checkArgument(index <= totalWeight, "Expected index <= totalWeight, but index:%s > totalWeight:%s", index, totalWeight); + checkArgument(index >= totalWeight - weight[n - 1] / 2, "Expected index >= totalWeight - weight[n - 1] / 2, but" + + "index:%s < %s", index, totalWeight - weight[n - 1] / 2); + + // weightSoFar = totalWeight - weight[n - 1] / 2 (very nearly) + // so we interpolate out to max value ever seen + double z1 = index - totalWeight - weight[n - 1] / 2.0; + double z2 = weight[n - 1] / 2 - z1; + return weightedAverage(mean[n - 1], z1, max, z2); } - /** - * Over-ride the min and max values for testing purposes - */ - @SuppressWarnings("SameParameterValue") - void setMinMax(double min, double max) { + public int centroidCount() + { + mergeNewValues(); + return activeCentroids; + } + + public Collection centroids() + { + compress(); + return new AbstractCollection() + { + @Override + public Iterator iterator() + { + return new Iterator() + { + int i; + + @Override + public boolean hasNext() + { + return i < activeCentroids; + } + + @Override + public Centroid next() + { + Centroid rc = new Centroid(mean[i], (int) weight[i]); + i++; + return rc; + } + + @Override + public void remove() + { + throw new UnsupportedOperationException("Default operation"); + } + }; + } + + @Override + public int size() + { + return activeCentroids; + } + }; + } + + public double getCompressionFactor() + { + return publicCompression; + } + + public long estimatedSerializedSizeInBytes() + { + compress(); + return SIZE_OF_BYTE // format + + SIZE_OF_BYTE // type (e.g double, float, bigint) + + SIZE_OF_DOUBLE // min + + SIZE_OF_DOUBLE // max + + SIZE_OF_DOUBLE // compression factor + + SIZE_OF_DOUBLE // total weight + + SIZE_OF_INT // number of centroids + + SIZE_OF_DOUBLE * activeCentroids // weight[], containing weight of each centroid + + SIZE_OF_DOUBLE * activeCentroids; // mean[], containing mean of each centroid + } + + public long estimatedInMemorySizeInBytes() + { + return INSTANCE_SIZE + sizeOf(weight) + 
sizeOf(mean) + sizeOf(tempWeight) + sizeOf(tempMean) + sizeOf(order); + } + + public Slice serialize() + { + SliceOutput sliceOutput = new DynamicSliceOutput(toIntExact(estimatedSerializedSizeInBytes())); + + sliceOutput.writeByte(0); // version 0 of T-Digest serialization + sliceOutput.writeByte(0); // represents the underlying data type of the distribution + sliceOutput.writeDouble(min); + sliceOutput.writeDouble(max); + sliceOutput.writeDouble(publicCompression); + sliceOutput.writeDouble(totalWeight); + sliceOutput.writeInt(activeCentroids); + sliceOutput.writeBytes(wrappedDoubleArray(weight), 0, activeCentroids * SIZE_OF_DOUBLE); + sliceOutput.writeBytes(wrappedDoubleArray(mean), 0, activeCentroids * SIZE_OF_DOUBLE); + return sliceOutput.slice(); + } + + private void setMinMax(double min, double max) + { this.min = min; this.max = max; } + + public double getMin() + { + return min; + } + + public double getMax() + { + return max; + } + + public String toString() + { + return format("TDigest\nCompression:%s\nCentroid Count:%s\nSize:%s\nMin:%s Median:%s Max:%s", + publicCompression, activeCentroids, totalWeight, min, getQuantile(0.5), max); + } } diff --git a/presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigestUtils.java similarity index 82% rename from presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java rename to presto-main/src/main/java/com/facebook/presto/tdigest/TDigestUtils.java index c7f9589b492ce..7a816cca68382 100644 --- a/presto-main/src/main/java/com/facebook/presto/tdigest/Sort.java +++ b/presto-main/src/main/java/com/facebook/presto/tdigest/TDigestUtils.java @@ -1,3 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Licensed to Ted Dunning under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -18,12 +32,54 @@ package com.facebook.presto.tdigest; import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; + +import static java.lang.String.format; + +public final class TDigestUtils +{ + private TDigestUtils() {} + + static double weightedAverage(double x1, double w1, double x2, double w2) + { + if (x1 <= x2) { + return weightedAverageSorted(x1, w1, x2, w2); + } + else { + return weightedAverageSorted(x2, w2, x1, w1); + } + } + + private static double weightedAverageSorted(double x1, double w1, double x2, double w2) + { + final double x = (x1 * w1 + x2 * w2) / (w1 + w2); + return Math.max(x1, Math.min(x, x2)); + } + + // Scale Functions + public static double maxSize(double q, double compression, double n) + { + return Z(compression, n) * q * (1 - q) / compression; + } + + public static double maxSize(double q, double normalizer) + { + return q * (1 - q) / normalizer; + } + + public static double normalizer(double compression, double n) + { + return compression / Z(compression, n); + } + + private static double Z(double compression, double n) + { + return 4 * Math.log(n / compression) + 24; + } + + // Sorting Functions + private static final Random prng = ThreadLocalRandom.current(); // for choosing pivots during quicksort -/** - * Static sorting methods - */ -public class Sort { - private static final Random prng = new Random(); // for choosing pivots during quicksort /** * Quick sort using an index array. On return, * values[order[i]] is in order as i goes 0..values.length @@ -31,8 +87,8 @@ public class Sort { * @param order Indexes into values * @param values The values to sort. */ - @SuppressWarnings("WeakerAccess") - public static void sort(int[] order, double[] values) { + public static void sort(int[] order, double[] values) + { sort(order, values, 0, values.length); } @@ -44,8 +100,8 @@ public static void sort(int[] order, double[] values) { * @param values The values to sort. * @param n The number of values to sort */ - @SuppressWarnings("WeakerAccess") - public static void sort(int[] order, double[] values, int n) { + public static void sort(int[] order, double[] values, int n) + { sort(order, values, 0, n); } @@ -58,8 +114,8 @@ public static void sort(int[] order, double[] values, int n) { * @param start The first element to sort * @param n The number of values to sort */ - @SuppressWarnings("WeakerAccess") - public static void sort(int[] order, double[] values, int start, int n) { + public static void sort(int[] order, double[] values, int start, int n) + { for (int i = start; i < start + n; i++) { order[i] = i; } @@ -76,10 +132,10 @@ public static void sort(int[] order, double[] values, int start, int n) { * @param end The value after the last value to sort * @param limit The minimum size to recurse down to. 
*/ - private static void quickSort(int[] order, double[] values, int start, int end, int limit) { + private static void quickSort(int[] order, double[] values, int start, int end, int limit) + { // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases while (end - start > limit) { - // pivot by a random element int pivotIndex = start + prng.nextInt(end - start); double pivotValue = values[order[pivotIndex]]; @@ -103,14 +159,17 @@ private static void quickSort(int[] order, double[] values, int start, int end, if (vi == pivotValue) { if (low != i) { swap(order, low, i); - } else { + } + else { i++; } low++; - } else if (vi > pivotValue) { + } + else if (vi > pivotValue) { high--; swap(order, i, high); - } else { + } + else { // vi < pivotValue i++; } @@ -132,14 +191,13 @@ private static void quickSort(int[] order, double[] values, int start, int end, if (from == low) { // ran out of things to copy. This means that the the last destination is the boundary low = to + 1; - } else { + } + else { // ran out of places to copy to. This means that there are uncopied pivots and the // boundary is at the beginning of those low = from; } -// checkPartition(order, values, pivotValue, start, low, high, end); - // now recurse, but arrange it so we handle the longer limit by tail recursion if (low - start < end - high) { quickSort(order, values, start, low, limit); @@ -147,7 +205,8 @@ private static void quickSort(int[] order, double[] values, int start, int end, // this is really a way to do // quickSort(order, values, high, end, limit); start = high; - } else { + } + else { quickSort(order, values, high, end, limit); // this is really a way to do // quickSort(order, values, start, low, limit); @@ -164,8 +223,8 @@ private static void quickSort(int[] order, double[] values, int start, int end, * @param key Values to sort on * @param values The auxilliary values to sort. */ - @SuppressWarnings("WeakerAccess") - public static void sort(double[] key, double[] ... values) { + public static void sort(double[] key, double[]... values) + { sort(key, 0, key.length, values); } @@ -177,8 +236,8 @@ public static void sort(double[] key, double[] ... values) { * @param n The number of values to sort * @param values The auxilliary values to sort. */ - @SuppressWarnings("WeakerAccess") - public static void sort(double[] key, int start, int n, double[]... values) { + public static void sort(double[] key, int start, int n, double[]... values) + { quickSort(key, values, start, start + n, 8); insertionSort(key, values, start, start + n, 8); } @@ -192,10 +251,10 @@ public static void sort(double[] key, int start, int n, double[]... values) { * @param end The value after the last value to sort * @param limit The minimum size to recurse down to. 
*/ - private static void quickSort(double[] key, double[][] values, int start, int end, int limit) { + private static void quickSort(double[] key, double[][] values, int start, int end, int limit) + { // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases while (end - start > limit) { - // median of three values for the pivot int a = start; int b = (start + end) / 2; @@ -212,31 +271,36 @@ private static void quickSort(double[] key, double[][] values, int start, int en // vc > va > vb pivotIndex = a; pivotValue = va; - } else { + } + else { // va > vb, va >= vc if (vc < vb) { // va > vb > vc pivotIndex = b; pivotValue = vb; - } else { + } + else { // va >= vc >= vb pivotIndex = c; pivotValue = vc; } } - } else { + } + else { // vb >= va if (vc > vb) { // vc > vb >= va pivotIndex = b; pivotValue = vb; - } else { + } + else { // vb >= va, vb >= vc if (vc < va) { // vb >= va > vc pivotIndex = a; pivotValue = va; - } else { + } + else { // vb >= vc >= va pivotIndex = c; pivotValue = vc; @@ -263,14 +327,17 @@ private static void quickSort(double[] key, double[][] values, int start, int en if (vi == pivotValue) { if (low != i) { swap(low, i, key, values); - } else { + } + else { i++; } low++; - } else if (vi > pivotValue) { + } + else if (vi > pivotValue) { high--; swap(i, high, key, values); - } else { + } + else { // vi < pivotValue i++; } @@ -292,14 +359,13 @@ private static void quickSort(double[] key, double[][] values, int start, int en if (from == low) { // ran out of things to copy. This means that the the last destination is the boundary low = to + 1; - } else { + } + else { // ran out of places to copy to. This means that there are uncopied pivots and the // boundary is at the beginning of those low = from; } -// checkPartition(order, values, pivotValue, start, low, high, end); - // now recurse, but arrange it so we handle the longer limit by tail recursion if (low - start < end - high) { quickSort(key, values, start, low, limit); @@ -307,7 +373,8 @@ private static void quickSort(double[] key, double[][] values, int start, int en // this is really a way to do // quickSort(order, values, high, end, limit); start = high; - } else { + } + else { quickSort(key, values, high, end, limit); // this is really a way to do // quickSort(order, values, start, low, limit); @@ -316,7 +383,6 @@ private static void quickSort(double[] key, double[][] values, int start, int en } } - /** * Limited range insertion sort. We assume that no element has to move more than limit steps * because quick sort has done its thing. This version works on parallel arrays of keys and values. @@ -328,7 +394,8 @@ private static void quickSort(double[] key, double[][] values, int start, int en * @param limit The largest amount of disorder */ @SuppressWarnings("SameParameterValue") - private static void insertionSort(double[] key, double[][] values, int start, int end, int limit) { + private static void insertionSort(double[] key, double[][] values, int start, int end, int limit) + { // loop invariant: all values start ... 
i-1 are ordered for (int i = start + 1; i < end; i++) { double v = key[i]; @@ -350,13 +417,15 @@ private static void insertionSort(double[] key, double[][] values, int start, in } } - private static void swap(int[] order, int i, int j) { + private static void swap(int[] order, int i, int j) + { int t = order[i]; order[i] = order[j]; order[j] = t; } - private static void swap(int i, int j, double[] key, double[]...values) { + private static void swap(int i, int j, double[] key, double[]...values) + { double t = key[i]; key[i] = key[j]; key[j] = t; @@ -379,33 +448,33 @@ private static void swap(int i, int j, double[] key, double[]...values) { * @param high Values from low to high are equal to the pivot. * @param end Values from high to end are above the pivot. */ - @SuppressWarnings("UnusedDeclaration") - public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) { + public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) + { if (order.length != values.length) { throw new IllegalArgumentException("Arguments must be same size"); } if (!(start >= 0 && low >= start && high >= low && end >= high)) { - throw new IllegalArgumentException(String.format("Invalid indices %d, %d, %d, %d", start, low, high, end)); + throw new IllegalArgumentException(format("Invalid indices %d, %d, %d, %d", start, low, high, end)); } for (int i = 0; i < low; i++) { double v = values[order[i]]; if (v >= pivotValue) { - throw new IllegalArgumentException(String.format("Value greater than pivot at %d", i)); + throw new IllegalArgumentException(format("Value greater than pivot at %d", i)); } } for (int i = low; i < high; i++) { if (values[order[i]] != pivotValue) { - throw new IllegalArgumentException(String.format("Non-pivot at %d", i)); + throw new IllegalArgumentException(format("Non-pivot at %d", i)); } } for (int i = high; i < end; i++) { double v = values[order[i]]; if (v <= pivotValue) { - throw new IllegalArgumentException(String.format("Value less than pivot at %d", i)); + throw new IllegalArgumentException(format("Value less than pivot at %d", i)); } } } @@ -421,7 +490,8 @@ public static void checkPartition(int[] order, double[] values, double pivotValu * @param limit The largest amount of disorder */ @SuppressWarnings("SameParameterValue") - private static void insertionSort(int[] order, double[] values, int start, int n, int limit) { + private static void insertionSort(int[] order, double[] values, int start, int n, int limit) + { for (int i = start + 1; i < n; i++) { int t = order[i]; double v = values[order[i]]; @@ -443,8 +513,8 @@ private static void insertionSort(int[] order, double[] values, int start, int n * * @param order The array to reverse */ - @SuppressWarnings("WeakerAccess") - public static void reverse(int[] order) { + public static void reverse(int[] order) + { reverse(order, 0, order.length); } @@ -455,8 +525,8 @@ public static void reverse(int[] order) { * @param offset Where to start reversing. * @param length How many elements to reverse */ - @SuppressWarnings("WeakerAccess") - public static void reverse(int[] order, int offset, int length) { + public static void reverse(int[] order, int offset, int length) + { for (int i = 0; i < length / 2; i++) { int t = order[offset + i]; order[offset + i] = order[offset + length - i - 1]; @@ -471,8 +541,9 @@ public static void reverse(int[] order, int offset, int length) { * @param offset Where to start reversing. 
* @param length How many elements to reverse */ - @SuppressWarnings({"WeakerAccess", "SameParameterValue"}) - public static void reverse(double[] order, int offset, int length) { + @SuppressWarnings("SameParameterValue") + public static void reverse(double[] order, int offset, int length) + { for (int i = 0; i < length / 2; i++) { double t = order[offset + i]; order[offset + i] = order[offset + length - i - 1]; diff --git a/presto-main/src/test/java/com/facebook/presto/tdigest/BenchmarkTDigest.java b/presto-main/src/test/java/com/facebook/presto/tdigest/BenchmarkTDigest.java new file mode 100644 index 0000000000000..8315240a90c29 --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/tdigest/BenchmarkTDigest.java @@ -0,0 +1,203 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.facebook.presto.tdigest; + +import io.airlift.slice.Slice; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.openjdk.jmh.runner.options.VerboseMode; + +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +import static com.facebook.presto.tdigest.TDigest.createTDigest; + +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(3) +@Warmup(iterations = 10) +@Measurement(iterations = 10) +public class BenchmarkTDigest +{ + private static final int NUMBER_OF_ENTRIES = 1_000_000; + private static final int STANDARD_COMPRESSION_FACTOR = 100; + + @State(Scope.Thread) + public static class Data + { + private long[] normalDistribution1; + private long[] normalDistribution2; + + @Setup + public void setup() + { + normalDistribution1 = makeNormalValues(NUMBER_OF_ENTRIES); + normalDistribution2 = makeNormalValues(NUMBER_OF_ENTRIES); + } + + private long[] makeNormalValues(int size) + { + long[] values = new long[size]; + for (int i = 0; i < size; i++) { + long value = Math.abs((long) (ThreadLocalRandom.current().nextGaussian() * 1_000_000_000)); + values[i] = value; + } + + return values; + } + + private long[] makeRandomValues(int size) + { + long[] values = new long[size]; + for (int i = 0; i < size; i++) { + values[i] = (long) (Math.random() * 1_000_000_000); + } + + return values; + } + } + + @State(Scope.Thread) + public static class Digest + { + protected TDigest digest1; + protected TDigest digest2; + protected Slice serializedDigest1; + + @Setup + public void setup(Data data) + { + digest1 = makeTDigest(data.normalDistribution1); + digest2 = makeTDigest(data.normalDistribution2); + serializedDigest1 = digest1.serialize(); + } + + private TDigest makeTDigest(long[] values) + { + TDigest result = createTDigest(STANDARD_COMPRESSION_FACTOR); + int k = 1_000_000 / NUMBER_OF_ENTRIES; + for (int i = 0; i < k; i++) { + for (long value : values) { + result.add(value); + } + } + return result; + } + } + + @State(Scope.Thread) + public static class DigestWithQuantile + extends Digest + { + // allows testing how fast different quantiles can be computed + @Param({"0.0001", "0.01", "0.2500", "0.5000", "0.7500", "0.9999"}) + float quantile; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OperationsPerInvocation(NUMBER_OF_ENTRIES) + public TDigest benchmarkInsertsT(Data data) + { + TDigest digest = createTDigest(STANDARD_COMPRESSION_FACTOR); + int k = 1_000_000 / NUMBER_OF_ENTRIES; + for (int i = 0; i < k; i++) { + for (long value : data.normalDistribution1) { + digest.add(value); + } + } + + return digest; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + public double benchmarkQuantilesT(DigestWithQuantile data) + { + return data.digest1.getQuantile(data.quantile); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + public TDigest 
benchmarkCopyT(Digest data) + { + TDigest copy = createTDigest(data.digest1.getCompressionFactor()); + copy.merge(data.digest1); + return copy; + } + + @Benchmark @BenchmarkMode(Mode.AverageTime) + public TDigest benchmarkMergeT(Digest data) + { + TDigest merged = createTDigest(data.digest1.serialize()); + merged.merge(data.digest2); + return merged; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + public TDigest benchmarkDeserializeT(Digest data) + { + return createTDigest(data.serializedDigest1); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + public Slice benchmarkSerializeT(Digest data) + { + return data.digest1.serialize(); + } + + public static void main(String[] args) + throws RunnerException + { + Options options = new OptionsBuilder() + .verbosity(VerboseMode.NORMAL) + .include(".*" + BenchmarkTDigest.class.getSimpleName() + ".*") + .build(); + + new Runner(options).run(); + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/tdigest/TestTDigest.java b/presto-main/src/test/java/com/facebook/presto/tdigest/TestTDigest.java new file mode 100644 index 0000000000000..c72cb9579a5a5 --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/tdigest/TestTDigest.java @@ -0,0 +1,369 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Licensed to Ted Dunning under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.facebook.presto.tdigest; + +import org.apache.commons.math3.distribution.BinomialDistribution; +import org.apache.commons.math3.distribution.GeometricDistribution; +import org.apache.commons.math3.distribution.NormalDistribution; +import org.apache.commons.math3.distribution.PoissonDistribution; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static com.facebook.presto.tdigest.TDigest.createTDigest; +import static java.lang.String.format; +import static java.util.Collections.sort; +import static org.testng.Assert.assertTrue; + +public class TestTDigest +{ + private static final int NUMBER_OF_ENTRIES = 1_000_000; + private static final int STANDARD_COMPRESSION_FACTOR = 100; + private static final double STANDARD_ERROR = 0.01; + private static final double[] quantile = {0.0001, 0.0200, 0.0300, 0.04000, 0.0500, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000, + 0.9000, 0.9500, 0.9600, 0.9700, 0.9800, 0.9999}; + + @Test + public void testAddElementsInOrder() + { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList<>(); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + tDigest.add(i); + list.add(i); + } + + for (int i = 0; i < quantile.length; i++) { + assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + @Test + public void testMergeTwoDistributionsWithoutOverlap() + { + TDigest tDigest1 = createTDigest(STANDARD_COMPRESSION_FACTOR); + TDigest tDigest2 = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList(); + + for (int i = 0; i < NUMBER_OF_ENTRIES / 2; i++) { + tDigest1.add(i); + tDigest2.add(i + NUMBER_OF_ENTRIES / 2); + list.add(i); + list.add(i + NUMBER_OF_ENTRIES / 2); + } + + tDigest1.merge(tDigest2); + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest1); + } + } + + @Test + public void testMergeTwoDistributionsWithOverlap() + { + TDigest tDigest1 = createTDigest(STANDARD_COMPRESSION_FACTOR); + TDigest tDigest2 = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList(); + + for (int i = 0; i < NUMBER_OF_ENTRIES / 2; i++) { + tDigest1.add(i); + tDigest2.add(i); + list.add(i); + list.add(i); + } + + tDigest2.merge(tDigest1); + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest2); + } + } + + @Test + public void testAddElementsRandomized() + { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList(); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + double value = Math.random() * NUMBER_OF_ENTRIES; + tDigest.add(value); + list.add(value); + } + + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + @Test + public void testNormalDistributionLowVariance() + { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList(); + NormalDistribution normal = new NormalDistribution(1000, 1); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + double value = normal.sample(); + tDigest.add(value); + list.add(value); + } + + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + @Test + public void testNormalDistributionHighVariance() + { + TDigest tDigest = 
createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList(); + NormalDistribution normal = new NormalDistribution(0, 1); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + double value = normal.sample(); + tDigest.add(value); + list.add(value); + } + + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + @Test + public void testMergeTwoNormalDistributions() + { + TDigest tDigest1 = createTDigest(STANDARD_COMPRESSION_FACTOR); + TDigest tDigest2 = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList<>(); + NormalDistribution normal = new NormalDistribution(0, 50); + + for (int i = 0; i < NUMBER_OF_ENTRIES / 2; i++) { + double value1 = normal.sample(); + double value2 = normal.sample(); + tDigest1.add(value1); + tDigest2.add(value2); + list.add(value1); + list.add(value2); + } + + tDigest1.merge(tDigest2); + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest1); + } + } + + @Test + public void testMergeManySmallNormalDistributions() + { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList<>(); + NormalDistribution normal = new NormalDistribution(500, 20); + int digests = 100_000; + + for (int k = 0; k < digests; k++) { + TDigest current = createTDigest(STANDARD_COMPRESSION_FACTOR); + for (int i = 0; i < 10; i++) { + double value = normal.sample(); + current.add(value); + list.add(value); + } + tDigest.merge(current); + } + + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + @Test + public void testMergeManyLargeNormalDistributions() + { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + List list = new ArrayList<>(); + NormalDistribution normal = new NormalDistribution(500, 20); + int digests = 1000; + + for (int k = 0; k < digests; k++) { + TDigest current = createTDigest(STANDARD_COMPRESSION_FACTOR); + for (int i = 0; i < NUMBER_OF_ENTRIES / digests; i++) { + double value = normal.sample(); + current.add(value); + list.add(value); + } + tDigest.merge(current); + } + + sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertContinuousWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + + // disabled because test takes almost 10s + @Test(enabled = false) + public void testBinomialDistribution() + { + int trials = 10; + for (int k = 1; k < trials; k++) { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + BinomialDistribution binomial = new BinomialDistribution(trials, k * 0.1); + List list = new ArrayList<>(); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + int sample = binomial.sample(); + tDigest.add(sample); + list.add(sample); + } + + Collections.sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + } + + @Test(enabled = false) + public void testGeometricDistribution() + { + int trials = 10; + for (int k = 1; k < trials; k++) { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + GeometricDistribution geometric = new GeometricDistribution(k * 0.1); + List list = new ArrayList(); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + int sample = geometric.sample(); + tDigest.add(sample); + list.add(sample); + } + + Collections.sort(list); + + for (int i = 0; i < quantile.length; i++) { + 
assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + } + + @Test(enabled = false) + public void testPoissonDistribution() + { + int trials = 10; + for (int k = 1; k < trials; k++) { + TDigest tDigest = createTDigest(STANDARD_COMPRESSION_FACTOR); + PoissonDistribution poisson = new PoissonDistribution(k * 0.1); + List list = new ArrayList(); + + for (int i = 0; i < NUMBER_OF_ENTRIES; i++) { + int sample = poisson.sample(); + tDigest.add(sample); + list.add(sample); + } + + Collections.sort(list); + + for (int i = 0; i < quantile.length; i++) { + assertDiscreteWithinBound(quantile[i], STANDARD_ERROR, list, tDigest); + } + } + } + + private void assertContinuousWithinBound(double quantile, double bound, List values, TDigest tDigest) + { + double lowerBound = quantile - bound; + double upperBound = quantile + bound; + + if (lowerBound < 0) { + lowerBound = tDigest.getMin(); + } + else { + lowerBound = values.get((int) (NUMBER_OF_ENTRIES * lowerBound)); + } + + if (upperBound >= 1) { + upperBound = tDigest.getMax(); + } + else { + upperBound = values.get((int) (NUMBER_OF_ENTRIES * upperBound)); + } + + assertTrue(tDigest.getQuantile(quantile) >= lowerBound && tDigest.getQuantile(quantile) <= upperBound, + format("Value %s is outside bound [%s, %s] for quantile %s", + tDigest.getQuantile(quantile), lowerBound, upperBound, quantile)); + } + + private void assertDiscreteWithinBound(double quantile, double bound, List values, TDigest tDigest) + { + double lowerBound = quantile - bound; + double upperBound = quantile + bound; + + if (lowerBound < 0) { + lowerBound = tDigest.getMin(); + } + else { + lowerBound = values.get((int) (NUMBER_OF_ENTRIES * lowerBound)); + } + + if (upperBound >= 1) { + upperBound = tDigest.getMax(); + } + else { + upperBound = values.get((int) (NUMBER_OF_ENTRIES * upperBound)); + } + // for discrete distributions, t-digest usually gives back a double that is between 2 integers (2.88, 1.16, etc) + // however, a discrete distribution should return an integer, not something in between + // we use Math.rint to round to the nearest integer, since casting as (int) always rounds down and no casting results in error > 1% + assertTrue(Math.rint(tDigest.getQuantile(quantile)) >= lowerBound && Math.rint(tDigest.getQuantile(quantile)) <= upperBound, + format("Value %s is outside bound [%s, %s] for quantile %s", tDigest.getQuantile(quantile), lowerBound, upperBound, quantile)); + } +}
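
A minimal sketch of how the new API composes end to end (the class name is illustrative; it assumes this patch and its airlift Slice dependency are on the classpath):

import com.facebook.presto.tdigest.TDigest;
import io.airlift.slice.Slice;

import static com.facebook.presto.tdigest.TDigest.createTDigest;

public class TDigestRoundTripExample
{
    public static void main(String[] args)
    {
        // build a digest with the standard compression factor used in the tests above
        TDigest digest = createTDigest(100);
        for (int i = 0; i < 1_000_000; i++) {
            digest.add(Math.random());
        }

        // quantile and cdf are inverse views of the same distribution
        double median = digest.getQuantile(0.5);
        double fractionBelowMedian = digest.getCdf(median);

        // round-trip through the serialized form: format byte, type byte, min, max,
        // compression factor, total weight, centroid count, then weights and means
        Slice serialized = digest.serialize();
        TDigest copy = createTDigest(serialized);

        // merge requires matching compression factors; copy now holds both point sets
        copy.merge(digest);

        System.out.printf("median=%.4f cdf(median)=%.4f merged size=%.0f%n",
                median, fractionBelowMedian, copy.getSize());
    }
}

Note that merge(TDigest) shuffles the other digest's centroids before re-adding them, so two merges of the same inputs can produce slightly different (but equally valid) digests.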