diff --git a/.gitignore b/.gitignore index 836905ce4..3913864c6 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ sonatype.sbt BUILD target/ lib_managed/ +project/metals.sbt project/boot/ project/build/target/ project/plugins/target/ diff --git a/.scalafmt.conf b/.scalafmt.conf index c9f903c4f..d4daaafab 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,7 +1,10 @@ version=3.6.0 runner.dialect = scala212 fileOverride { - "glob:**/scala-2.13*/**" { + "glob:**/scala-3/**" { + runner.dialect = scala3 + } + "glob:**/scala-2*/**" { runner.dialect = scala213 } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala b/algebird-core/src/main/scala-2.11/Aggregator.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala rename to algebird-core/src/main/scala-2.11/Aggregator.scala index 4e78d234b..fd380a15d 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala +++ b/algebird-core/src/main/scala-2.11/Aggregator.scala @@ -20,7 +20,7 @@ object Aggregator extends java.io.Serializable { * This is a trivial aggregator that always returns a single value */ def const[T](t: T): MonoidAggregator[Any, Unit, T] = - prepareMonoid { _: Any => () }.andThenPresent(_ => t) + prepareMonoid((_: Any) => ()).andThenPresent(_ => t) /** * Using Aggregator.prepare,present you can add to this aggregator @@ -172,7 +172,7 @@ object Aggregator extends java.io.Serializable { * How many items satisfy a predicate */ def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = - prepareMonoid { t: T => if (pred(t)) 1L else 0L } + prepareMonoid((t: T) => if (pred(t)) 1L else 0L) /** * Do any items satisfy some predicate @@ -310,7 +310,7 @@ object Aggregator extends java.io.Serializable { * Put everything in a Set. Note, this could fill the memory if the Set is very large. */ def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = - prepareMonoid { t: T => Set(t) } + prepareMonoid((t: T) => Set(t)) /** * This builds an in-memory Set, and then finally gets the size of that set. 
This may not be scalable if the diff --git a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala b/algebird-core/src/main/scala-2.11/CountMinSketch.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala rename to algebird-core/src/main/scala-2.11/CountMinSketch.scala index f000c7fe3..809d8785f 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala +++ b/algebird-core/src/main/scala-2.11/CountMinSketch.scala @@ -185,9 +185,9 @@ class CMSSummation[K](params: CMSParams[K]) { val rit = matrix.iterator while (rit.hasNext) { var col = 0 - val cit = rit.next.iterator + val cit = rit.next().iterator while (cit.hasNext) { - cells(offset + col) += cit.next + cells(offset + col) += cit.next() col += 1 } offset += width @@ -206,7 +206,7 @@ class CMSSummation[K](params: CMSParams[K]) { b += cells(offset + col) col += 1 } - b.result + b.result() } val b = Vector.newBuilder[Vector[Long]] @@ -215,7 +215,7 @@ class CMSSummation[K](params: CMSParams[K]) { b += vectorize(row) row += 1 } - CMSInstance(CMSInstance.CountsTable(b.result), totalCount, params) + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) } } @@ -724,7 +724,7 @@ case class CMSInstance[K]( val it = countsTable.counts.iterator var i = 0 while (it.hasNext) { - val row = it.next + val row = it.next() val count = row(hs(i)(item)) if (count < freq) freq = count i += 1 @@ -817,13 +817,13 @@ object CMSInstance { val yss = other.counts.iterator val rows = Vector.newBuilder[Vector[Long]] while (xss.hasNext) { - val xs = xss.next.iterator - val ys = yss.next.iterator + val xs = xss.next().iterator + val ys = yss.next().iterator val row = Vector.newBuilder[Long] - while (xs.hasNext) row += (xs.next + ys.next) - rows += row.result + while (xs.hasNext) row += (xs.next() + ys.next()) + rows += row.result() } - CountsTable[K](rows.result) + CountsTable[K](rows.result()) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala b/algebird-core/src/main/scala-2.11/DecayedVector.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala rename to algebird-core/src/main/scala-2.11/DecayedVector.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala b/algebird-core/src/main/scala-2.11/DecayingCMS.scala similarity index 98% rename from algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala rename to algebird-core/src/main/scala-2.11/DecayingCMS.scala index 2b6a5f157..fd8433754 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala +++ b/algebird-core/src/main/scala-2.11/DecayingCMS.scala @@ -210,7 +210,7 @@ final class DecayingCMS[K]( val hashFns: Array[K => Int] = { val rng = new Random(seed) def genPos(): Int = - rng.nextInt match { + rng.nextInt() match { case 0 => genPos() case n => n & 0x7fffffff } @@ -323,10 +323,10 @@ final class DecayingCMS[K]( var i = 0 while (i < cells.length) { val it = cells(i).iterator - var localMax = it.next // we know it doesn't start empty + var localMax = it.next() // we know it doesn't start empty if (localMax < minMinimum) minMinimum = localMax while (it.hasNext) { - val n = it.next + val n = it.next() if (n > localMax) localMax = n else if (n < minMinimum) minMinimum = n } @@ -362,7 +362,7 @@ final class DecayingCMS[K]( val it0 = this.cells(i).iterator val it1 = that.cells(i).iterator while (it0.hasNext) { - val x = it0.next * it1.next + val x = it0.next() * 
it1.next() if (x != 0.0) sum += x } if (sum < res) res = sum @@ -426,7 +426,7 @@ final class DecayingCMS[K]( val x = this val y = other val timeInHL = Math.max(x.timeInHL, y.timeInHL) - val cms = new CMS(allocCells, 0.0, timeInHL) + val cms = new CMS(allocCells(), 0.0, timeInHL) val xscale = x.getScale(timeInHL) val yscale = y.getScale(timeInHL) @@ -445,7 +445,7 @@ final class DecayingCMS[K]( bldr += prod(left(j), xscale) + prod(right(j), yscale) j += 1 } - cms.cells(i) = bldr.result + cms.cells(i) = bldr.result() i += 1 } cms @@ -505,7 +505,7 @@ final class DecayingCMS[K]( if (expL == 0.0) { new CMS(monoid.zero.cells, 0.0, ts) } else { - val cms = new CMS(allocCells, 0.0, ts) + val cms = new CMS(allocCells(), 0.0, ts) var i = 0 while (i < depth) { val ci = cells(i) @@ -547,7 +547,7 @@ final class DecayingCMS[K]( bldr += scratch(j) j += 1 } - cells(i) = bldr.result + cells(i) = bldr.result() i += 1 } cells @@ -606,7 +606,7 @@ final class DecayingCMS[K]( val arr = new Array[CMS](ChunkSize) while (it.hasNext) { while (it.hasNext && i < ChunkSize) { - arr(i) = it.next + arr(i) = it.next() i += 1 } if (i > 1) { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala b/algebird-core/src/main/scala-2.11/Fold.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Fold.scala rename to algebird-core/src/main/scala-2.11/Fold.scala index c2f21d145..ded32e628 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala +++ b/algebird-core/src/main/scala-2.11/Fold.scala @@ -66,8 +66,8 @@ sealed trait Fold[-I, +O] extends Serializable { val self = this new Fold[I, P] { type X = self.X - override def build: FoldState[X, I, P] = - self.build.map(f) + override def build(): FoldState[X, I, P] = + self.build().map(f) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Interval.scala b/algebird-core/src/main/scala-2.11/Interval.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/Interval.scala rename to algebird-core/src/main/scala-2.11/Interval.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.11/InvariantAlgebras.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala rename to algebird-core/src/main/scala-2.11/InvariantAlgebras.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala b/algebird-core/src/main/scala-2.11/JavaMonoids.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala rename to algebird-core/src/main/scala-2.11/JavaMonoids.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala b/algebird-core/src/main/scala-2.11/MapAlgebra.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala rename to algebird-core/src/main/scala-2.11/MapAlgebra.scala index 8ee81c42d..55a9f8e54 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala +++ b/algebird-core/src/main/scala-2.11/MapAlgebra.scala @@ -224,7 +224,7 @@ object MapAlgebra { } else oldVOpt.get bldr += v } - mutable.iterator.map { case (k, bldr) => (k, bldr.result) }.toMap + mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap } // Consider this as edges from k -> v, produce a Map[K,Set[V]] diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala 
b/algebird-core/src/main/scala-2.11/Scan.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Scan.scala rename to algebird-core/src/main/scala-2.11/Scan.scala index ff0dce400..d1d10ced7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala +++ b/algebird-core/src/main/scala-2.11/Scan.scala @@ -169,9 +169,9 @@ sealed abstract class Scan[-I, +O] extends Serializable { def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] { override def hasNext: Boolean = iter.hasNext var state: State = initialState - override def next: O = { + override def next(): O = { val thisState = state - val thisA = iter.next + val thisA = iter.next() val (thisC, nextState) = presentAndNextState(thisA, thisState) state = nextState thisC diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala b/algebird-core/src/main/scala-2.11/SpaceSaver.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala rename to algebird-core/src/main/scala-2.11/SpaceSaver.scala index 68830547e..d18b58dd6 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala +++ b/algebird-core/src/main/scala-2.11/SpaceSaver.scala @@ -78,7 +78,7 @@ object SpaceSaver { buff.putLong(b) buffer ++= buff.array() } - buffer.result.toArray + buffer.result().toArray } // Make sure to be reversible so fromBytes(toBytes(x)) == x diff --git a/algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala b/algebird-core/src/main/scala-2.11/VectorSpace.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala rename to algebird-core/src/main/scala-2.11/VectorSpace.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.11/monad/EitherMonad.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala rename to algebird-core/src/main/scala-2.11/monad/EitherMonad.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala b/algebird-core/src/main/scala-2.11/monad/Reader.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala rename to algebird-core/src/main/scala-2.11/monad/Reader.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala b/algebird-core/src/main/scala-2.11/monad/StateWithError.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala rename to algebird-core/src/main/scala-2.11/monad/StateWithError.scala diff --git a/algebird-core/src/main/scala-2.12/Aggregator.scala b/algebird-core/src/main/scala-2.12/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. 
+ * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. 
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator. 
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count). 
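+ *
+ * A hypothetical usage sketch (the values are illustrative, not part of the original source):
+ * {{{
+ * // keep the 2 longest strings, longest first
+ * val agg = Aggregator.sortByReverseTake[String, Int](2)(_.length)
+ * // agg(List("a", "ccc", "bb")) == Seq("ccc", "bb")
+ * }}}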
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100) + + /** + * Returns the lower bound of a given percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative.
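+ *
+ * A hypothetical sketch (illustrative only): {{{ val medianLowerBound = Aggregator.approximatePercentile[Long](0.5) }}}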
+ */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * > import com.twitter.algebird.Ring.numericRing > def numericAggregator[T: Numeric]: MonoidAggregator[T, + * T, T] = Aggregator.fromMonoid[T] + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's fold package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get + + /** + * This is the safe version of the above.
If the input is empty, return None, else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for Monoid Aggregators it never will, instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that)) + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators.
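+ *
+ * A hypothetical sketch (the aggregators and data are illustrative, not from the original source):
+ * {{{
+ * val maxAndMin = Aggregator.max[Int].zip(Aggregator.min[Int])
+ * // maxAndMin(List((1, 10), (3, 7))) == (3, 7)
+ * }}}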
+ * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-2.12/CountMinSketch.scala b/algebird-core/src/main/scala-2.12/CountMinSketch.scala new file mode 100644 index 000000000..826aebd5a --- /dev/null +++ b/algebird-core/src/main/scala-2.12/CountMinSketch.scala @@ -0,0 +1,1420 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */ +/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have less than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
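+ *
+ * A usage sketch, mirroring how `CMSMonoid.create(data)` uses this builder elsewhere in this file:
+ * {{{
+ * val summation = new CMSSummation(params)
+ * data.foreach(k => summation.insert(k, 1L))
+ * val cms = summation.result
+ * }}}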
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`.
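+ *
+ * A worked example (derived from the `exp(-depth)` formula below): `delta(10) == math.exp(-10)`, roughly `4.54e-5`.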
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +} + +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`.
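+ *
+ * A worked example (using `depth = ceil(ln(1 / delta))` from [[CMSFunctions]]): `delta = 1e-10` gives
+ * `depth = ceil(ln(1e10)) = 24`.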
+ */ + def depth: Int = CMSFunctions.depth(delta) + + /** + * Number of counters per hash function (also: number of columns in the counting table). This number is + * derived from `eps`. + */ + def width: Int = CMSFunctions.width(eps) + + /** + * An Option parameter about how many exact counts a sparse CMS wants to keep + */ + def maxExactCountOpt: Option[Int] + + /** + * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`. + */ + def maxExactCount: Int = + CMSFunctions.maxExactCount(maxExactCountOpt, depth, width) + + /** + * Returns a new sketch that is the combination of this sketch and the other sketch. + */ + def ++(other: C[K]): C[K] + + /** + * Counts the item and returns the result as a new sketch. + */ + def +(item: K): C[K] = this + (item, 1L) + + /** + * Counts the item `count` times and returns the result as a new sketch. + */ + def +(item: K, count: Long): C[K] + + /** + * Returns an estimate of the total number of times this item has been seen in the stream so far. This + * estimate is an upper bound. + * + * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also + * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`. + */ + def frequency(item: K): Approximate[Long] + + /** + * Returns an estimate of the inner product against another data stream. + * + * In other words, let a_i denote the number of times element i has been seen in the data stream summarized + * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of `<a, b> = + * \sum a_i b_i`. + * + * Note: This can also be viewed as the join size between two relations. + * + * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it + * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`. + */ + def innerProduct(other: C[K]): Approximate[Long] + + /** + * Total number of elements counted (i.e. seen in the data stream) so far. + */ + def totalCount: Long + + /** + * The first frequency moment is the total number of elements in the stream. + */ + def f1: Long = totalCount + + /** + * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element. + */ + def f2: Approximate[Long] + +} + +/** + * A trait for CMS implementations that can track heavy hitters in a data stream. + * + * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one + * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N" + * heavy hitters. + * + * Known implementations: [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + */ +trait CMSHeavyHitters[K] { + + /** + * The pluggable logic of how heavy hitters are being tracked. + */ + def heavyHittersLogic: HeavyHittersLogic[K] + + /** + * Returns the set of heavy hitters.
+ */ + def heavyHitters: Set[K] + +} + +object CMS { + + def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] = + monoid(eps, delta, seed, None) + def monoid[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSMonoid[K] = + new CMSMonoid[K](eps, delta, seed, maxExactCountOpt) + + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] = + monoid(depth, width, seed, None) + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] = + aggregator(eps, delta, seed, None) + def aggregator[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt)) + + def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] = + aggregator(depth, width, seed, None) + def aggregator[K: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + /** + * Returns a fresh, zeroed CMS instance. + */ + def apply[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] = None + ): CMS[K] = { + val params = { + val hashes: Seq[CMSHash[K]] = + CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + CMSZero[K](params) + } + +} + +/** + * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data + * stream. + * + * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]]. + * + * =Usage= + * + * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`. + * + * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is, + * the counting table behind the scenes is backed by `Long` values (at least in the current implementation), + * and thus the returned frequency estimates are always instances of `Approximate[Long]`. + * + * @example + * {{{ + * + * // Creates a monoid for a CMS that can count `Long` elements. val cmsMonoid: CMSMonoid[Long] = { val eps = + * 0.001 val delta = 1E-10 val seed = 1 CMS.monoid[Long](eps, delta, seed) } + * + * // Creates a CMS instance that has counted the element `1L`. val cms: CMS[Long] = cmsMonoid.create(1L) + * + * // Estimates the frequency of `1L` val estimate: Approximate[Long] = cms.frequency(1L) + * }}} + * + * @tparam K + * The type used to identify the elements to be counted. + */ +sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] { + + override val eps: Double = params.eps + + override val delta: Double = params.delta + + override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt + + override def f2: Approximate[Long] = innerProduct(this) + +} + +/** + * Zero element. Used for initialization. 
+ */ +case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) { + + override val totalCount: Long = 0L + + override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params) + + override def ++(other: CMS[K]): CMS[K] = other + + override def frequency(item: K): Approximate[Long] = Approximate.exact(0L) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + Approximate.exact(0L) + +} + +/** + * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables. + */ +case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K]) + extends CMS[K](params) { + + override def +(x: K, count: Long): CMS[K] = + SparseCMS[K](params) + (item, totalCount) + (x, count) + + override def ++(other: CMS[K]): CMS[K] = + other match { + case _: CMSZero[?] => this + case other: CMSItem[K] => + CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount) + case _ => other + item + } + + override def frequency(x: K): Approximate[Long] = + if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + Approximate.exact(totalCount) * other.frequency(item) + +} + +/** + * A sparse Count-Min sketch structure, used for situations where the key is highly skewed. + */ +case class SparseCMS[K]( + exactCountTable: Map[K, Long], + override val totalCount: Long, + override val params: CMSParams[K] +) extends CMS[K](params) { + import SparseCMS._ + + override def +(x: K, count: Long): CMS[K] = { + val currentCount = exactCountTable.getOrElse(x, 0L) + val newTable = exactCountTable.updated(x, currentCount + count) + if (newTable.size < maxExactCount) { + // still sparse + SparseCMS(newTable, totalCount = totalCount + count, params = params) + } else { + toDense(newTable, params) + } + } + + override def ++(other: CMS[K]): CMS[K] = + other match { + case _: CMSZero[?] => this + case other: CMSItem[K] => this + (other.item, other.totalCount) + case other: SparseCMS[K] => + // This SparseCMS's maxExactCount is used, so ++ is not commutative + val newTable = Semigroup.plus(exactCountTable, other.exactCountTable) + if (newTable.size < maxExactCount) { + // still sparse + SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params) + } else { + toDense(newTable, params) + } + + case other: CMSInstance[K] => other ++ this + } + + override def frequency(x: K): Approximate[Long] = + Approximate.exact(exactCountTable.getOrElse(x, 0L)) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + exactCountTable.iterator + .map { case (x, count) => Approximate.exact(count) * other.frequency(x) } + .reduceOption(_ + _) + .getOrElse(Approximate.exact(0L)) +} + +object SparseCMS { + + /** + * Creates a new [[SparseCMS]] with empty exactCountTable + */ + def apply[K](params: CMSParams[K]): SparseCMS[K] = { + val exactCountTable = Map[K, Long]() + SparseCMS[K](exactCountTable, totalCount = 0, params = params) + } + + /** + * Creates a new [[CMSInstance]] from a Map[K, Long] + */ + def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] = + // Create new CMSInstance + exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) => + cms + (x, count) + } +} + +/** + * The general Count-Min sketch structure, used for holding any number of elements.
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+   * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+   * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
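+     *
+     * For illustration (a sketch): starting from a zeroed 2x2 table, adding at position (0, 1) leaves
+     * every other cell untouched:
+     * {{{
+     * val t = CMSInstance.CountsTable[Long](2, 2) // Vector(Vector(0, 0), Vector(0, 0))
+     * t + ((0, 1), 5L)                            // Vector(Vector(0, 5), Vector(0, 0))
+     * }}}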
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
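+   *
+   * Equivalent to folding `create` over the items with `plus`, e.g. (an illustrative sketch reusing
+   * `topPctCMSMonoid` from the [[TopCMS]] class-level example):
+   * {{{
+   * val sketch: TopCMS[Long] = topPctCMSMonoid.create(Seq(1L, 2L, 2L))
+   * }}}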
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
+ *   Note that this mapping between the elements of your problem domain and their identifiers used for
+ *   counting via CMS should be bijective. We require a [[CMSHasher]] context bound for `K`, see [[CMSHasher]]
+ *   for available implicits that can be imported. Which type K should you pick in practice? For domains that
+ *   have less than `2^64` unique elements, you'd typically use `Long`. For larger domains you can try
+ *   `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
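+ *
+ * For example (an illustrative sketch relying on the shipped `CMSHasher[Long]`; parameter values are
+ * arbitrary):
+ * {{{
+ * val monoid = TopNCMS.monoid[Long](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 10)
+ * val cms = monoid.create(Seq(1L, 1L, 2L)) // safe: built from a single stream, no `++` merging
+ * }}}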
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
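+ *
+ * For example (an illustrative sketch; parameter values are arbitrary): with K1 = country, K2 = city and
+ * heavyHittersN = 2, the two most frequent cities are kept per country, rather than the two most frequent
+ * (country, city) pairs overall:
+ * {{{
+ * val monoid = ScopedTopNCMS.monoid[String, String](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 2)
+ * val cms = monoid.create(Seq(("us", "nyc"), ("us", "sf"), ("fr", "paris")))
+ * }}}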
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-2.12/DecayedVector.scala b/algebird-core/src/main/scala-2.12/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-2.12/DecayingCMS.scala b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
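+     *
+     * As a quick worked example of the formula below (illustrative numbers): a value of 8.0 at
+     * timeInHL = 2.0 decays to 1.0 three half-lives later, so timeToUnit = log(8) / log(2) + 2.0 = 5.0.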
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        // 1 = value * module.getScale(0.0, timeInHL, result)
+        // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        // log(value) = (result - timeInHL) * log2
+        // log(value) / log2 = result - timeInHL
+        // log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   *   delta = (t1 - t0) / halflife
+   *   y1 = y0 * 2^(-delta) + n
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   *   (1) zN = yN * scaleN
+   *
+   * Our constraint is expressed as:
+   *
+   *   (2) If n=0, z1 = z0
+   *
+   * In that case:
+   *
+   *   (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   *   (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   *   (5) 2^(-delta) * scale1 = scale0
+   *   (6) scale1 = scale0 * 2^(delta)
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   *   (7) z1 = y1 * scale1
+   *   (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   *   (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   *   (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   *   (11) z1 / scale1 = z0 / scale1 + n
+   *   (12) z1 = z0 + n * scale1
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   *   (13) logscale1 = logscale0 + delta * log(2)
+   *   (14) z1 = z0 + n * exp(logscale1)
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   *   (15) y1 = z1 / scale1
+   *   (16) y1 = z1 / exp(logscale1)
+   *   (17) y1 = z1 * exp(-logscale1)
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
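+     *
+     * Illustrative use (a sketch):
+     * {{{
+     * val (floor, cap) = cms.range
+     * // every key's reported value is >= floor; no key's value is reported above cap
+     * }}}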
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double = + if (x == 0.0) 0.0 else x * y + + var i = 0 + while (i < depth) { + val left = x.cells(i) + val right = y.cells(i) + var j = 0 + val bldr = rowBuilder() + while (j < width) { + bldr += prod(left(j), xscale) + prod(right(j), yscale) + j += 1 + } + cms.cells(i) = bldr.result() + i += 1 + } + cms + } + + def add(t: Long, k: K, n: Double): CMS = + scaledAdd(fromTimestamp(t), k, n) + + // TODO: we could allocate a mutable scratch pad, write all the + // values into it, and then build a CMS out of it. if items is + // very small, this would be less efficient than what we're doing + // now. probably the "ideal" solution would be determine how many + // items there are. if we have fewer than ~width items, this + // approach is fine. for more, a scratch pad would be better + // (assuming we wrote that code). + // + // alternately, you could map items into (zero + item) and then + // use the monoid's sum to boil it down. + // + // we only use this in testing currently so the current code is + // fine until we rely on it in production. any change here should + // probably include benchmarks justifying the design. + def bulkAdd(items: Iterable[(Long, K, Double)]): CMS = + items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) } + + private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS = + if (n < 0.0) { + val t = toTimestamp(ts1) + throw new IllegalArgumentException( + s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t" + ) + } else if (n == 0.0) { + this + } else { + val logScale1 = nextLogScale(ts1) + if (logScale1 > maxLogScale) { + rescaleTo(ts1).scaledAdd(ts1, k, n) + } else { + val increment = n * Math.exp(logScale1) + val cells1 = allocCells() + var didx = 0 + while (didx < depth) { + val cell = cells(didx) + val w = hashFns(didx)(k) + cells1(didx) = cell.updated(w, cell(w) + increment) + didx += 1 + } + new CMS(cells1, logScale1, ts1) + } + } + + // Set the scale back to 0.0 + // input time is in half-lives + private[algebird] def rescaleTo(ts: Double): CMS = { + val logScale1 = nextLogScale(ts) + val expL = Math.exp(-logScale1) + if (expL == 0.0) { + new CMS(monoid.zero.cells, 0.0, ts) + } else { + val cms = new CMS(allocCells(), 0.0, ts) + var i = 0 + while (i < depth) { + val ci = cells(i) + cms.cells(i) = ci.map(_ * expL) + i += 1 + } + cms + } + } + } + + private def rowBuilder() = { + val bldr = Vector.newBuilder[Double] + bldr.sizeHint(width) + bldr + } + + object CMS { + + implicit val monoidForCMS: Monoid[CMS] = + new Monoid[CMS] { + + def zero: CMS = module.empty + + def plus(x: CMS, y: CMS): CMS = + x + y + + /** + * Turn a flat array into an array of vectors. + */ + private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = { + val cells = new Array[Vector[Double]](depth) + var i = 0 + while (i < depth) { + var j = i * width + val limit = j + width + val bldr = rowBuilder() + while (j < limit) { + bldr += scratch(j) + j += 1 + } + cells(i) = bldr.result() + i += 1 + } + cells + } + + /** + * This method sums the first `num` items in `arr`. 
+         */
+        private def innerSum(arr: Array[CMS], num: Int): CMS =
+          if (num == 0) zero
+          else if (num == 1) arr(0)
+          else if (num == 2) plus(arr(0), arr(1))
+          else {
+            // start with zero
+            val scratch: Array[Double] = new Array(totalCells)
+
+            val latestTimeInHL: Double =
+              arr.iterator.take(num).map(cms => cms.timeInHL).max
+
+            var i = 0
+            while (i < num) {
+              val cms = arr(i)
+              val scale = cms.getScale(latestTimeInHL)
+              var j = 0
+              while (j < depth) {
+                val row = cms.cells(j)
+                val stride = j * width
+                var k = 0
+                while (k < width) {
+                  val n = row(k)
+                  if (n > 0.0) {
+                    scratch(stride + k) += scale * n
+                  }
+                  k += 1
+                }
+                j += 1
+              }
+              i += 1
+            }
+
+            val cells = scratchToCells(scratch)
+
+            new CMS(cells, 0.0, latestTimeInHL)
+          }
+
+        override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = {
+
+          val it: Iterator[CMS] = xs.toIterator
+          val ChunkSize = 1000
+
+          // the idea here is that we read up to 1000 CMS values into
+          // a fixed array, crunch them down to a single CMS, store it
+          // in the first array index, read up to 999 more CMS values
+          // in, crunch them down, and so on.
+          var i = 0
+          val arr = new Array[CMS](ChunkSize)
+          while (it.hasNext) {
+            while (it.hasNext && i < ChunkSize) {
+              arr(i) = it.next()
+              i += 1
+            }
+            if (i > 1) {
+              arr(0) = innerSum(arr, i)
+            }
+            i = 1
+          }
+          if (i == 0) None else Some(arr(0))
+        }
+      }
+  }
+
+  val monoid: Monoid[CMS] = CMS.monoidForCMS
+}
+
+object DecayingCMS {
+
+  /**
+   * Construct a DecayingCMS module.
+   *
+   * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will
+   * always produce the same hash family.
+   *
+   * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by
+   * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to
+   * zero.
+   *
+   * The size of the CMS in bytes is O(depth * width).
+   *
+   * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use
+   * width=100, for 0.1% error, use width=1000, etc.
+   *
+   * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha *
+   * depth), so a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this
+   * as small as possible.
+   */
+  def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit
+      hasher: CMSHasher[K]
+  ): DecayingCMS[K] =
+    new DecayingCMS(seed, halfLife, depth, width, hasher)
+}
diff --git a/algebird-core/src/main/scala-2.12/Fold.scala b/algebird-core/src/main/scala-2.12/Fold.scala
new file mode 100644
index 000000000..0b89f2d62
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Fold.scala
@@ -0,0 +1,352 @@
+/*
+Copyright 2014 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.io.Serializable
+import scala.collection.compat._
+
+/**
+ * Folds are first-class representations of "Traversable.foldLeft."
+ * They have the nice property that they can be fused to work in parallel over an input sequence.
+ *
+ * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when
+ * done. We use existential types to hide internal details and to allow for internal and external (X and O)
+ * types to differ for "map" and "join."
+ *
+ * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a
+ * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the
+ * fold.
+ *
+ * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like
+ * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also
+ * expose some internal state so library authors can fold over their own types.
+ *
+ * See the companion object for constructors.
+ */
+sealed trait Fold[-I, +O] extends Serializable {
+
+  /**
+   * Users can ignore this type.
+   *
+   * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good
+   * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it
+   * provides.
+   */
+  type X
+
+  /**
+   * Users can ignore this method. It is exposed so library authors can run folds over their own sequence
+   * types.
+   *
+   * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the
+   * same Fold many times over different data structures, but we must build a new FoldState every time.
+   *
+   * See FoldState for information on how to use this for your own sequence types.
+   */
+  def build(): FoldState[X, I, O]
+
+  /**
+   * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or
+   * "Function1.andThen."
+   */
+  def map[P](f: O => P): Fold[I, P] = {
+    val self = this
+    new Fold[I, P] {
+      type X = self.X
+      override def build(): FoldState[X, I, P] =
+        self.build().map(f)
+    }
+  }
+
+  /**
+   * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time
+   * and combines at the end.
+   */
+  def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = {
+    val self = this
+    new Fold[I2, Q] {
+      type X = (self.X, other.X)
+      override def build(): FoldState[X, I2, Q] = {
+        val first = self.build()
+        val second = other.build()
+        new FoldState(
+          { case ((x, y), i) => (first.add(x, i), second.add(y, i)) },
+          (first.start, second.start),
+          { case (x, y) => f(first.end(x), second.end(y)) }
+        )
+      }
+    }
+  }
+
+  /**
+   * Convenient shorthand for joining Folds without combining at the end.
+   */
+  def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] =
+    joinWith(other) { case (o, p) => (o, p) }
+
+  /**
+   * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.")
+   * This is analogous to "Function1.compose."
+   */
+  def contramap[H](f: H => I): Fold[H, O] = {
+    val self = this
+    new Fold[H, O] {
+      type X = self.X
+      override def build(): FoldState[X, H, O] =
+        self.build().contramap(f)
+    }
+  }
+
+  /**
+   * Trivially runs a Fold over an empty sequence.
+   */
+  def overEmpty: O = {
+    // build is a "def" so we construct the state once and use the pieces to run the fold
+    val state = build()
+    state.end(state.start)
+  }
+
+  /**
+   * Trivially runs a Fold over a single element sequence.
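+   *
+   * For example (a sketch): `Fold.size.overSingleton("a")` yields `1L`.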
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
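+   *
+   * For example (an illustrative sketch):
+   * {{{
+   * val both = Fold.sequence(Seq(Fold.size, Fold.count[Int](_ > 1)))
+   * both.overTraversable(List(1, 2, 3)) // Seq(3L, 2L)
+   * }}}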
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const:
+   * def const[A, B](b: B): (A => B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
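+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Fold.exists[Int](_ > 2).overTraversable(List(1, 2, 3)) // true
+   * }}}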
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-2.12/Interval.scala b/algebird-core/src/main/scala-2.12/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
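+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Interval.leftClosedRightOpen(0, 10) // a MaybeEmpty[Int, Interval.InLowExUp], not a bare Interval[Int]
+   * }}}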
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change.
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that intr.contains(s)
+     *
+     * if this returns None, it may be Empty, Upper, or Universe
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s)
+     *
+     * if this returns None, it may be Empty, Lower, or Universe
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0, 1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
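+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * ExclusiveLower(0).intersects(ExclusiveUpper(1)) // true, yet no Int lies in (0, 1)
+   * }}}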
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option because of cases like
+   * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p: 
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
 Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot be
+   * normalized to [a, b) form, thus we return an Option. Also, there are cases, like [Int.MinValue,
+   * Int.MaxValue], that cannot be normalized but are actually equivalent to Universe. The bottom line: if
+   * this returns None, it just means you can't express the interval this way; it does not mean it is empty or
+   * universe, etc. (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?] 
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-2.12/JavaMonoids.scala b/algebird-core/src/main/scala-2.12/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-2.12/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */
+package com.twitter.algebird
+
+import java.lang.{
+  Boolean => JBool,
+  Double => JDouble,
+  Float => JFloat,
+  Integer => JInt,
+  Long => JLong,
+  Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+  override val zero: JInt = JInt.valueOf(0)
+  override val one: JInt = JInt.valueOf(1)
+  override def plus(x: JInt, y: JInt): JInt = x + y
+  override def negate(x: JInt): JInt = -x
+  override def minus(x: JInt, y: JInt): JInt = x - y
+  override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+  override val zero: JShort = Short.box(0)
+  override val one: JShort = Short.box(1)
+  override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+  override def negate(x: JShort): JShort = (-x).toShort
+  override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+  override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+  override val zero: JLong = JLong.valueOf(0L)
+  override val one: JLong = JLong.valueOf(1L)
+  override def plus(x: JLong, y: JLong): JLong = x + y
+  override def negate(x: JLong): JLong = -x
+  override def minus(x: JLong, y: JLong): JLong = x - y
+  override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+  override val zero: JFloat = JFloat.valueOf(0.0f)
+  override val one: JFloat = JFloat.valueOf(1.0f)
+  override def plus(x: JFloat, y: JFloat): JFloat = x + y
+  override def negate(x: JFloat): JFloat = -x
+  override def minus(x: JFloat, y: JFloat): JFloat = x - y
+  override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+  override val zero: JDouble = JDouble.valueOf(0.0)
+  override val one: JDouble = JDouble.valueOf(1.0)
+  override def plus(x: JDouble, y: JDouble): JDouble = x + y
+  override def negate(x: JDouble): JDouble = -x
+  override def minus(x: JDouble, y: JDouble): JDouble = x - y
+  override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+  override val zero: JBool = JBool.FALSE
+  override val one: JBool = JBool.TRUE
+  override def plus(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue ^ y.booleanValue)
+  override def negate(x: JBool): JBool = x
+  override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+  override def times(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since Java Lists are mutable, this always makes a full copy. Prefer Scala immutable Lists: if you use Scala
+ * immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+  override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+  override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+  override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+    val res = new JArrayList[T](x.size + y.size)
+    res.addAll(x)
+    res.addAll(y)
+    res
+  }
+}
+
+/**
+ * Since Java Maps are mutable, this always makes a full copy. Prefer Scala immutable Maps: if you use Scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+  override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+  val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+    case mon: Monoid[?] => mon.isNonZero(_)
+    case _ => _ => true
+  }
+
+  override def isNonZero(x: JMap[K, V]): Boolean =
+    !x.isEmpty && (implicitly[Semigroup[V]] match {
+      case mon: Monoid[?] =>
+        x.values.asScala.exists(v => mon.isNonZero(v))
+      case _ => true
+    })
+  override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = {
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    val vsemi = implicitly[Semigroup[V]]
+    val result = new JHashMap[K, V](big.size + small.size)
+    result.putAll(big)
+    small.entrySet.asScala.foreach { kv =>
+      val smallK = kv.getKey
+      val smallV = kv.getValue
+      if (big.containsKey(smallK)) {
+        val bigV = big.get(smallK)
+        val newV =
+          if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV)
+        if (nonZero(newV))
+          result.put(smallK, newV)
+        else
+          result.remove(smallK)
+      } else {
+        // No need to explicitly add with zero on V, just put in the small value
+        result.put(smallK, smallV)
+      }
+    }
+    result
+  }
+}
diff --git a/algebird-core/src/main/scala-2.12/MapAlgebra.scala b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
new file mode 100644
index 000000000..9ca370eaf
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
@@ -0,0 +1,320 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import com.twitter.algebird.macros.{Cuber, Roller}
+import scala.collection.mutable.{Builder, Map => MMap}
+import scala.collection.{Map => ScMap}
+import algebra.ring.Rng
+import scala.collection.compat._
+
+trait MapOperations[K, V, M <: ScMap[K, V]] {
+  def add(oldMap: M, kv: (K, V)): M
+  def remove(oldMap: M, k: K): M
+  def fromMutable(mut: MMap[K, V]): M
+}
+
+abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V])
+    extends Monoid[M]
+    with MapOperations[K, V, M] {
+
+  val nonZero: (V => Boolean) = semigroup match {
+    case mon: Monoid[?] => mon.isNonZero(_)
+    case _ => _ => true
+  }
+
+  override def isNonZero(x: M): Boolean =
+    !x.isEmpty && (semigroup match {
+      case mon: Monoid[?] =>
+        x.valuesIterator.exists(v => mon.isNonZero(v))
+      case _ => true
+    })
+
+  override def plus(x: M, y: M): M = {
+    // Scala maps can reuse internal structure, so don't copy, just add into the bigger one:
+    // This really saves computation when adding lots of small maps into big ones (common)
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    small match {
+      // Mutable maps create new copies of the underlying data on add so don't use the
+      // handleImmutable method.
+      // Cannot have a None so 'get' is safe here.
+      case _: MMap[?, ?] => sumOption(Seq(big, small)).get
+      case _ => handleImmutable(big, small, bigOnLeft)
+    }
+  }
+
+  private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) =
+    small.foldLeft(big) { (oldMap, kv) =>
+      val newV = big
+        .get(kv._1)
+        .map { bigV =>
+          if (bigOnLeft)
+            semigroup.plus(bigV, kv._2)
+          else
+            semigroup.plus(kv._2, bigV)
+        }
+        .getOrElse(kv._2)
+      if (nonZero(newV))
+        add(oldMap, kv._1 -> newV)
+      else
+        remove(oldMap, kv._1)
+    }
+  override def sumOption(items: TraversableOnce[M]): Option[M] =
+    if (items.iterator.isEmpty) None
+    else {
+      val mutable = MMap[K, V]()
+      items.iterator.foreach { m =>
+        m.foreach { case (k, v) =>
+          val oldVOpt = mutable.get(k)
+          // sorry for the micro optimization here: avoiding a closure
+          val newV =
+            if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v)
+          if (nonZero(newV))
+            mutable.update(k, newV)
+          else
+            mutable.remove(k)
+        }
+      }
+      Some(fromMutable(mutable))
+    }
+}
+
+class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] {
+  override lazy val zero: Map[K, V] = Map[K, V]()
+  override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv
+  override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k
+  override def fromMutable(mut: MMap[K, V]): Map[K, V] =
+    new MutableBackedMap(mut)
+}
+
+class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] {
+  override lazy val zero: ScMap[K, V] = ScMap[K, V]()
+  override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv
+  override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k
+  override def fromMutable(mut: MMap[K, V]): ScMap[K, V] =
+    new MutableBackedMap(mut)
+}
+
+/**
+ * You can think of this as a sparse vector group
+ */
+class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] {
+  override def negate(kv: Map[K, V]): Map[K, V] =
+    kv.iterator.map { case (k, v) =>
+      (k, group.negate(v))
+    }.toMap
+}
+
+class ScMapGroup[K, V](implicit val group: Group[V])
+    extends ScMapMonoid[K, V]()(group)
+    with Group[ScMap[K, V]] {
+  override def negate(kv: ScMap[K, V]): ScMap[K, V] =
+    kv.iterator.map { case (k, v) =>
+      (k, group.negate(v))
+    }.toMap
+}
+
+/**
+ * You can think of this as a sparse vector ring
+ */
+trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] {
+
+  implicit def ring: Ring[V]
+
+  override def times(x: M, y: M): M = {
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    small.foldLeft(zero) { (oldMap, kv) =>
+      val bigV = big.getOrElse(kv._1, ring.zero)
+      val newV =
+        if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV)
+      if (ring.isNonZero(newV)) {
+        add(oldMap, kv._1 -> newV)
+      } else {
+        remove(oldMap, kv._1)
+      }
+    }
+  }
+}
+
+class MapRing[K, V](implicit override val ring: Ring[V])
+    extends MapGroup[K, V]()(ring)
+    with GenericMapRing[K, V, Map[K, V]]
+
+class ScMapRing[K, V](implicit override val ring: Ring[V])
+    extends ScMapGroup[K, V]()(ring)
+    with GenericMapRing[K, V, ScMap[K, V]]
+
+object MapAlgebra {
+  def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean =
+    l.forall { case (k, v) =>
+      r.get(k).exists(Equiv[V].equiv(_, v))
+    }
+
+  implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] =
+    Equiv.fromFunction { (m1, m2) =>
+      val cleanM1 = removeZeros(m1)
+      val cleanM2 = removeZeros(m2)
+      rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, cleanM1)
+    }
+
+  def mergeLookup[T, U, V: Monoid](
+      keys: TraversableOnce[T]
+  )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+    sumByKey {
+      keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+    }
+
+  // Returns a new map with zero-value entries removed
+  def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+    m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+  /**
+   * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+   * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+   * equivalent to:
+   *
+   * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+   *
+   * Otherwise, the function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+   */
+  def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+    Monoid.sum(pairs.iterator.map(Map(_)))
+
+  /**
+   * For each key, creates a list of all values. This function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2))
+   */
+  def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+    if (pairs.iterator.isEmpty) Map.empty
+    else {
+      val mutable = MMap[K, Builder[V, List[V]]]()
+      pairs.iterator.foreach { case (k, v) =>
+        val oldVOpt = mutable.get(k)
+        // sorry for the micro optimization here: avoiding a closure
+        val bldr = if (oldVOpt.isEmpty) {
+          val b = List.newBuilder[V]
+          mutable.update(k, b)
+          b
+        } else oldVOpt.get
+        bldr += v
+      }
+      mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+    }
+
+  // Consider this as edges from k -> v, produce a Map[K,Set[V]]
+  def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+    Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+  /** join the keys of two maps (similar to outer-join in a DB) */
+  def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+    Monoid
+      .plus(
+        map1.transform { case (_, v) =>
+          (List(v), List[W]())
+        },
+        map2.transform { case (_, w) =>
+          (List[V](), List(w))
+        }
+      )
+      .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
+
+  /**
+   * Reverses a graph losslessly. The None key is for v's with no sources.
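+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * MapAlgebra.invertExact(Map(Option(1) -> Set("a"))) // Map(Some("a") -> Set(1))
+   * }}}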
+   */
+  def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+    def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+      if (i.isEmpty) Iterable(None)
+      else {
+        i.map(Some(_))
+      }
+
+    Monoid.sum {
+      for {
+        (k, sv) <- m.view.toIterable
+        v <- nonEmptyIter(sv)
+      } yield Map(v -> k.toSet)
+    }
+  }
+
+  /**
+   * Invert the common case of exactly one value for each key
+   */
+  def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+    Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+  def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+    Monoid.sum(mring.times(left, right).values)
+
+  def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+    val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      c(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+  def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit c: Cuber[K]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+  def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+    val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      r(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+  def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit r: Roller[K]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-2.12/Scan.scala b/algebird-core/src/main/scala-2.12/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+  /**
+   * Most consumers of Scan don't care about the State type variable. But for those that do, we make an
+   * effort to expose it in all of our combinators.
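+   * For example, a running-sum scan that exposes its Int state (an illustrative sketch, not part of the
+   * original patch):
+   * {{{
+   * val runningSum: Scan.Aux[Int, Int, Int] = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * }}}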
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+   * stream of outputs. Streams can be thought of as one hidden state that is queryable for a head element,
+   * and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head element and tail stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+   *   aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final summation;
+      // this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+   *   monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair within
+ * the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+   *   result of updating stateBeforeProcessing with the information from i.
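+   *
+   * For example (an illustrative sketch, not part of the original patch), for a running-sum scan:
+   * {{{
+   * val runningSum = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * runningSum.presentAndNextState(3, 5) // (8, 8)
+   * }}}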
+   */
+  def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+  /**
+   * @param iter
+   * @return
+   *   If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) =
+   *   presentAndNextState(a_i, state_i)` and `state_0 = initialState`
+   */
+  def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+    override def hasNext: Boolean = iter.hasNext
+    var state: State = initialState
+    override def next(): O = {
+      val thisState = state
+      val thisA = iter.next()
+      val (thisC, nextState) = presentAndNextState(thisA, thisState)
+      state = nextState
+      thisC
+    }
+  }
+
+  /**
+   * @param inputs
+   * @param bf
+   * @tparam In
+   *   The type of the input collection
+   * @tparam Out
+   *   The type of the output collection
+   * @return
+   *   Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form:
+   *   `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`.
+   */
+  def apply[In <: TraversableOnce[I], Out](
+      inputs: In
+  )(implicit bf: BuildFrom[In, O, Out]): Out =
+    bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+  // combinators
+
+  /**
+   * Return a new scan that is the same as this scan, but with a different `initialState`.
+   * @param newInitialState
+   * @return
+   */
+  def replaceState(newInitialState: => State): Aux[I, State, O] =
+    from(newInitialState)(presentAndNextState(_, _))
+
+  def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+    presentAndNextState(f(i), stateBeforeProcessingI)
+  }
+
+  def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    (g(c), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+   * pollute the `State` by pairing it redundantly with `Unit`.
+   * @tparam I1
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, then this results in a Scan whose `apply` method returns
+   *   `[(o_1, a_1), ..., (o_n, a_n)]` when given the same input.
+   */
+  def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+    ((o, i), stateAfterProcessingI)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan before each input updates the state.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+   */
+  def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((stateBeforeProcessingI, o), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan after each input updates the state.
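+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Scan.from(0)((i: Int, s: Int) => (s + i, s + i)).joinWithPosteriorState(List(1, 2))
+   * // List((1, 1), (3, 3))
+   * }}}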
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose apply method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan whose
+   *   apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]` results in the output `[(o_1, p_1), ...,
+   *   (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) == scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan whose
+   *   apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a scan
+   *   which returns `[p_1, ..., p_n]`.
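+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Scan.index.compose(Scan.fromFunction((n: Long) => n * 2))(List("a", "b", "c")) // List(0L, 2L, 4L)
+   * }}}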
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.12/SpaceSaver.scala b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public API to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public API to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing in order
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the item/counters (counters as length + N * (item size + item + 2 * counters))
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets, as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+    Try {
+      bb.get().toInt match {
+        case 1 =>
+          val capacity = bb.getInt
+          val itemLength = bb.getInt
+          val itemAsBytes = new Array[Byte](itemLength)
+          bb.get(itemAsBytes)
+          tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+        case 2 =>
+          val capacity = bb.getInt
+
+          var countersToDeserialize = bb.getInt
+          val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+          while (countersToDeserialize != 0) {
+            val itemLength = bb.getInt()
+            val itemAsBytes = new Array[Byte](itemLength)
+            bb.get(itemAsBytes)
+            val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+            val a = bb.getLong
+            val b = bb.getLong
+
+            item match {
+              case Failure(e) => return Failure(e)
+              case Success(i) =>
+                counters += ((i, (a, b)))
+            }
+
+            countersToDeserialize -= 1
+          }
+
+          Success(SSMany(capacity, counters.toMap))
+      }
+    }.flatten
+}
+
+/**
+ * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * to preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+  import SpaceSaver.ordering
+
+  /**
+   * Maximum number of counters to keep (parameter "m" in the research paper).
+   */
+  def capacity: Int
+
+  /**
+   * Current lowest value for count
+   */
+  def min: Long
+
+  /**
+   * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+   * (error)
+   */
+  def counters: Map[T, (Long, Long)]
+
+  def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+  /**
+   * Returns the frequency estimate for the item
+   */
+  def frequency(item: T): Approximate[Long] = {
+    val (count, err) = counters.getOrElse(item, (min, min))
+    Approximate(count - err, count, count, 1.0)
+  }
+
+  /**
+   * Get the elements that show up at least thres times. Returns sorted in descending order: (item,
+   * Approximate[Long], guaranteed)
+   */
+  def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+    counters.iterator
+      .filter { case (_, (count, _)) => count >= thres }
+      .toList
+      .sorted(ordering)
+      .map { case (item, (count, err)) =>
+        (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+      }
+
+  /**
+   * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+   */
+  def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+    require(k < capacity)
+    val si = counters.toList
+      .sorted(ordering)
+    val siK = si.take(k)
+    val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+    siK.map { case (item, (count, err)) =>
+      (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+    }
+  }
+
+  /**
+   * Check consistency with other SpaceSaver, useful for testing. 
+ + /** + * Check consistency with other SpaceSaver, useful for testing. Returns a boolean indicating whether they are + * consistent + */ + def consistentWith(that: SpaceSaver[T]): Boolean = + (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0) +} + +case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] { + require(capacity > 1) + + override def min: Long = 0L + + override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L))) + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => SSMany(this).add(other) + case other: SSMany[?] => other.add(this) + } +} + +object SSMany { + private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] = + SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap + + private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] = + SSMany(capacity, counters, bucketsFromCounters(counters)) + + private[algebird] def apply[T](one: SSOne[T]): SSMany[T] = + SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item))) +} + +case class SSMany[T] private ( + override val capacity: Int, + override val counters: Map[T, (Long, Long)], + buckets: SortedMap[Long, Set[T]] +) extends SpaceSaver[T] { + private val exact: Boolean = counters.size < capacity + + override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey + + // item is already present and just needs to be bumped up one + private def bump(item: T) = { + val (count, err) = counters(item) + val counters1 = counters + (item -> ((count + 1L, err))) // increment by one + val currBucket = buckets(count) // current bucket + val buckets1 = { + if (currBucket.size == 1) // delete current bucket since it will be empty + buckets - count + else // remove item from current bucket + buckets + (count -> (currBucket - item)) + } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // lose one item to meet capacity constraint + private def loseOne = { + val firstBucket = buckets(buckets.firstKey) + val itemToLose = firstBucket.head + val counters1 = counters - itemToLose + val buckets1 = + if (firstBucket.size == 1) + buckets - min + else + buckets + (min -> (firstBucket - itemToLose)) + SSMany(capacity, counters1, buckets1) + } + + // introduce new item + private def introduce(item: T, count: Long, err: Long) = { + val counters1 = counters + (item -> ((count, err))) + val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // add a single element + private[algebird] def add(x: SSOne[T]): SSMany[T] = { + require(x.capacity == capacity) + if (counters.contains(x.item)) + bump(x.item) + else + (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min) + } + + // merge two stream summaries + private def merge(x: SSMany[T]): SSMany[T] = { + require(x.capacity == capacity) + val counters1 = Map() ++ + (counters.keySet ++ x.counters.keySet).toList + .map { key => + val (count1, err1) = counters.getOrElse(key, (min, min)) + val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min)) + key -> ((count1 + count2, err1 + err2)) + } + .sorted(SpaceSaver.ordering) + .take(capacity) + SSMany(capacity, counters1) + } + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => add(other) + case other: SSMany[?]
=> merge(other) + } +} + +class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] { + override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y +} diff --git a/algebird-core/src/main/scala-2.12/VectorSpace.scala b/algebird-core/src/main/scala-2.12/VectorSpace.scala new file mode 100644 index 000000000..f8818600c --- /dev/null +++ b/algebird-core/src/main/scala-2.12/VectorSpace.scala @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +import scala.annotation.implicitNotFound + +/** + * This class represents a vector space. For the required properties see: + * + * http://en.wikipedia.org/wiki/Vector_space#Definition + */ +object VectorSpace extends VectorSpaceOps with Implicits + +sealed trait VectorSpaceOps { + def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] = + vs.scale(v, c) + def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] = + new VectorSpace[F, C] { + override def ring: Ring[F] = r + override def group: Group[C[F]] = cGroup + override def scale(v: F, c: C[F]): C[F] = + if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero + } +} +private object VectorSpaceOps extends VectorSpaceOps + +sealed trait Implicits extends LowPrioImpicits { + implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] = + VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _))) +} + +sealed trait LowPrioImpicits { + implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] = + VectorSpaceOps.from[T, Map[K, _]] { (s, m) => + m.transform { case (_, v) => Ring.times(s, v) } + } +} + +@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}") +trait VectorSpace[F, C[_]] extends java.io.Serializable { + implicit def ring: Ring[F] + def field: Ring[F] = ring // this is for compatibility with older versions + implicit def group: Group[C[F]] + def scale(v: F, c: C[F]): C[F] +} diff --git a/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala new file mode 100644 index 000000000..b6d5e2ffc --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala @@ -0,0 +1,37 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// Monad for either, used for modeling Error where L is the type of the error +object EitherMonad { + class Error[L] extends Monad[Either[L, *]] { + override def apply[R](r: R): Right[L, R] = Right(r) + + override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] = + self.right.flatMap(next) + + override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] = + self.right.map(fn) + } + + implicit def monad[L]: Monad[Either[L, _]] = new Error[L] + + def assert[L](truth: Boolean, failure: => L): Either[L, Unit] = + if (truth) Right(()) else Left(failure) +} diff --git a/algebird-core/src/main/scala-2.12/monad/Reader.scala b/algebird-core/src/main/scala-2.12/monad/Reader.scala new file mode 100644 index 000000000..e0747af20 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/Reader.scala @@ -0,0 +1,76 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// TODO this is general, move somewhere better + +// Reader Monad, represents a series of operations that mutate some environment +// type (the input to the function) + +sealed trait Reader[-Env, +T] { + def apply(env: Env): T + def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] = + FlatMappedReader[E1, T, U](this, next) + def map[U](thatFn: T => U): Reader[Env, U] = + FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t))) +} + +final case class ConstantReader[+T](get: T) extends Reader[Any, T] { + override def apply(env: Any): T = get + override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get)) + override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] = + next(get) +} +final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] { + override def apply(env: E): T = fn(env) +} +final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] { + override def apply(env: E): T = { + @annotation.tailrec + def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any = + r match { + case ConstantReader(get) => + stack match { + case head :: tail => loop(head(get), tail) + case Nil => get + } + case ReaderFn(fn) => + stack match { + case head :: tail => loop(head(fn(env)), tail) + case Nil => fn(env) + } + case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack) + } + loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T] + } +} + +object Reader { + def const[T](t: T): Reader[Any, T] = ConstantReader(t) + implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn) + + class ReaderM[Env] extends Monad[Reader[Env, _]] { + override def apply[T](t: T): ConstantReader[T] = ConstantReader(t) + override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] = + self.flatMap(next) + override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] 
= self.map(fn) + } + + implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env] +} diff --git a/algebird-core/src/main/scala-2.12/monad/StateWithError.scala b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala new file mode 100644 index 000000000..e15a9ebc3 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala @@ -0,0 +1,130 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.{Monad, Semigroup} + +/** + * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase + * with existing mutable APIs (like storm or cascading), but retain the ability to compose carefully. + */ +sealed trait StateWithError[S, +F, +T] { + def join[F1 >: F, U]( + that: StateWithError[S, F1, U], + mergeErr: (F1, F1) => F1, + mergeState: (S, S) => S + ): StateWithError[S, F1, (T, U)] = + join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState)) + + def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit + sgf: Semigroup[F1], + sgs: Semigroup[S] + ): // TODO: deep joins could blow the stack, not yet using trampoline here + StateWithError[S, F1, (T, U)] = + StateFn { (requested: S) => + (run(requested), that.run(requested)) match { + case (Right((s1, r1)), Right((s2, r2))) => + Right((sgs.plus(s1, s2), (r1, r2))) + case (Left(err1), Left(err2)) => + Left(sgf.plus(err1, err2)) // Our earlier is not ready + case (Left(err), _) => Left(err) + case (_, Left(err)) => Left(err) + } + } + + def apply(state: S): Either[F, (S, T)] = run(state) + + def run(state: S): Either[F, (S, T)] + + def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] = + FlatMappedState(this, next) + + def map[U](fn: (T) => U): StateWithError[S, F, U] = + FlatMappedState(this, (t: T) => StateWithError.const(fn(t))) +} + +/** Simple wrapper of a function in the Monad */ +final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] { + override def run(state: S): Either[F, (S, T)] = fn(state) +} + +/** + * A Trampolining instance that should prevent stack overflow at the expense of performance + */ +final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U]) + extends StateWithError[S, F, U] { + override def run(state: S): Either[F, (S, U)] = { + @annotation.tailrec + def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any = + st match { + case StateFn(fn) => + fn(inState) match { + case err @ Left(_) => err // bail at first error + case noError @ Right((newState, out)) => + stack match { + case head :: tailStack => loop(newState, head(out), tailStack) + case Nil => noError // recursion ends + } + } + case FlatMappedState(st, next) => loop(inState, st, next :: stack) + } + loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]] + } +} + +object StateWithError { + def getState[S]: StateWithError[S, 
Nothing, S] = + StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala-2.13/Aggregator.scala b/algebird-core/src/main/scala-2.13/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. 
+ * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. 
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next()) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator.
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count). 
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)
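A sketch contrasting the exact and approximate unique counters just defined; it assumes the implicit `Hash128[String]` instance that Algebird ships:

```scala
import com.twitter.algebird.Aggregator

object UniqueCountDemo {
  def main(args: Array[String]): Unit = {
    val ids = (1 to 10000).map(i => s"user-${i % 500}")
    // Exact: builds an in-memory Set, then takes its size.
    val exact: Int = Aggregator.uniqueCount[String].apply(ids)
    // Approximate: exact Set up to 100 items, then HLL with bounded memory.
    val approx: Long = Aggregator.approximateUniqueCount[String].apply(ids)
    println(s"exact=$exact approx=$approx") // exact=500, approx close to 500
  }
}
```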
+ + /** + * Returns the lower bound of a given percentile, where the percentile is in (0, 1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile, where the percentile is in (0, 1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * > import com.twitter.algebird.Ring.numericRing > def numericAggregator[T: Numeric]: MonoidAggregator[T, + * T, T] = Aggregator.fromMonoid[T] + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's fold package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get
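As a quick illustration of the prepare/reduce/present decomposition just defined, a minimal sketch using `prepareMonoid` from the object above:

```scala
import com.twitter.algebird.Aggregator

object PrepareReducePresentDemo {
  def main(args: Array[String]): Unit = {
    // prepare: String => Int, reduce: Monoid[Int].plus, present: identity
    val totalLength = Aggregator.prepareMonoid { (s: String) => s.length }
    println(totalLength(Seq("ann", "bob", "carol"))) // 3 + 3 + 5 = 11
  }
}
```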
+ + /** + * This is the safe version of the above. If the input is empty, return None, else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for Monoid Aggregators it never will; instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that))
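A sketch of the single-pass behavior `join` provides, pairing `numericSum` with `size` (both defined in this file) to compute a mean; the division in `andThenPresent` is this example's own, not library code:

```scala
import com.twitter.algebird.Aggregator

object JoinDemo {
  def main(args: Array[String]): Unit = {
    // Sum and count are computed together in one pass, then combined.
    val mean = Aggregator
      .numericSum[Int]
      .join(Aggregator.size)
      .andThenPresent { case (sum, n) => sum / n }
    println(mean(List(1, 2, 3, 4))) // 2.5
  }
}
```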
+ + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-2.13/CountMinSketch.scala b/algebird-core/src/main/scala-2.13/CountMinSketch.scala new file mode 100644 index 000000000..826aebd5a --- /dev/null +++ b/algebird-core/src/main/scala-2.13/CountMinSketch.scala @@ -0,0 +1,1420 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */
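The update and query rules described above are small enough to sketch directly. This toy table uses made-up hash constants purely for illustration, not Algebird's `CMSHash`:

```scala
// Toy count-min table following the rules above: d rows, w columns,
// counts(i)(h_i(x)) += 1 on update, min over rows on query.
object CountMinToy {
  private val depth = 3
  private val width = 271
  private val seeds = Array(31L, 131L, 313L) // illustrative constants
  private val counts = Array.ofDim[Long](depth, width)

  private def h(i: Int, x: Long): Int =
    (((seeds(i) * x) % 2147483647L).toInt.abs) % width

  def add(x: Long): Unit = // counts[i, h_i(x)] += 1 for each row i
    (0 until depth).foreach(i => counts(i)(h(i, x)) += 1L)

  def estimate(x: Long): Long = // min_i counts[i, h_i(x)]
    (0 until depth).map(i => counts(i)(h(i, x))).min
}
```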
+/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`; see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have fewer than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hashes functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`. 
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +}
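A quick check of the translations defined above, with values computed from the formulas in the scaladoc:

```scala
import com.twitter.algebird.CMSFunctions

object CMSSizingDemo {
  def main(args: Array[String]): Unit = {
    val eps = 0.001
    val delta = 1e-10
    println(CMSFunctions.width(eps))   // ceil(e / eps)       == 2719
    println(CMSFunctions.depth(delta)) // ceil(ln(1 / delta)) == 24
    println(CMSFunctions.eps(2719))    // ~0.001, the inverse translation
  }
}
```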
+ +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`. + */ + def depth: Int = CMSFunctions.depth(delta) + + /** + * Number of counters per hash function (also: number of columns in the counting table). This number is + * derived from `eps`. + */ + def width: Int = CMSFunctions.width(eps) + + /** + * An Option parameter about how many exact counts a sparse CMS wants to keep + */ + def maxExactCountOpt: Option[Int] + + /** + * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`. + */ + def maxExactCount: Int = + CMSFunctions.maxExactCount(maxExactCountOpt, depth, width) + + /** + * Returns a new sketch that is the combination of this sketch and the other sketch. + */ + def ++(other: C[K]): C[K] + + /** + * Counts the item and returns the result as a new sketch. + */ + def +(item: K): C[K] = this + (item, 1L) + + /** + * Counts the item `count` times and returns the result as a new sketch. + */ + def +(item: K, count: Long): C[K] + + /** + * Returns an estimate of the total number of times this item has been seen in the stream so far. This + * estimate is an upper bound. + * + * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also + * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`. + */ + def frequency(item: K): Approximate[Long] + + /** + * Returns an estimate of the inner product against another data stream. + * + * In other words, let a_i denote the number of times element i has been seen in the data stream summarized + * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of `<a, b> = + * \sum a_i b_i`. + * + * Note: This can also be viewed as the join size between two relations. + * + * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it + * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`. + */ + def innerProduct(other: C[K]): Approximate[Long] + + /** + * Total number of elements counted (i.e. seen in the data stream) so far. + */ + def totalCount: Long + + /** + * The first frequency moment is the total number of elements in the stream. + */ + def f1: Long = totalCount + + /** + * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element. + */ + def f2: Approximate[Long] + +}
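A sketch of the frequency guarantee described above, assuming the implicit `CMSHasher[Long]` that Algebird ships:

```scala
import com.twitter.algebird.CMS

object FrequencyBoundDemo {
  def main(args: Array[String]): Unit = {
    val monoid = CMS.monoid[Long](eps = 0.001, delta = 1e-8, seed = 1)
    val cms = monoid.create((1L to 100L) ++ Seq.fill(50)(42L))
    val freq = cms.frequency(42L)
    // True count is 51; the estimate never undercounts, and overshoots
    // by at most eps * totalCount with probability >= 1 - delta.
    println(s"estimate=${freq.estimate} totalCount=${cms.totalCount}")
  }
}
```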
+ +/** + * A trait for CMS implementations that can track heavy hitters in a data stream. + * + * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one + * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N" + * heavy hitters. + * + * Known implementations: [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + */ +trait CMSHeavyHitters[K] { + + /** + * The pluggable logic of how heavy hitters are being tracked. + */ + def heavyHittersLogic: HeavyHittersLogic[K] + + /** + * Returns the set of heavy hitters. + */ + def heavyHitters: Set[K] + +} + +object CMS { + + def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] = + monoid(eps, delta, seed, None) + def monoid[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSMonoid[K] = + new CMSMonoid[K](eps, delta, seed, maxExactCountOpt) + + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] = + monoid(depth, width, seed, None) + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] = + aggregator(eps, delta, seed, None) + def aggregator[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt)) + + def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] = + aggregator(depth, width, seed, None) + def aggregator[K: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + /** + * Returns a fresh, zeroed CMS instance. + */ + def apply[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] = None + ): CMS[K] = { + val params = { + val hashes: Seq[CMSHash[K]] = + CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + CMSZero[K](params) + } + +} + +/** + * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data + * stream. + * + * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]]. + * + * =Usage= + * + * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`. + * + * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is, + * the counting table behind the scenes is backed by `Long` values (at least in the current implementation), + * and thus the returned frequency estimates are always instances of `Approximate[Long]`. + * + * @example + * {{{ + * // Creates a monoid for a CMS that can count `Long` elements. + * val cmsMonoid: CMSMonoid[Long] = { + * val eps = 0.001 + * val delta = 1E-10 + * val seed = 1 + * CMS.monoid[Long](eps, delta, seed) + * } + * + * // Creates a CMS instance that has counted the element `1L`. + * val cms: CMS[Long] = cmsMonoid.create(1L) + * + * // Estimates the frequency of `1L` + * val estimate: Approximate[Long] = cms.frequency(1L) + * }}} + * + * @tparam K + * The type used to identify the elements to be counted. + */ +sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] { + + override val eps: Double = params.eps + + override val delta: Double = params.delta + + override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt + + override def f2: Approximate[Long] = innerProduct(this) + +}
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+  override val totalCount: Long = 0L
+
+  override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+  override def ++(other: CMS[K]): CMS[K] = other
+
+  override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+    extends CMS[K](params) {
+
+  override def +(x: K, count: Long): CMS[K] =
+    SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] =>
+        CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+      case _ => other + item
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+    exactCountTable: Map[K, Long],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+  import SparseCMS._
+
+  override def +(x: K, count: Long): CMS[K] = {
+    val currentCount = exactCountTable.getOrElse(x, 0L)
+    val newTable = exactCountTable.updated(x, currentCount + count)
+    if (newTable.size < maxExactCount) {
+      // still sparse
+      SparseCMS(newTable, totalCount = totalCount + count, params = params)
+    } else {
+      toDense(newTable, params)
+    }
+  }
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + (other.item, other.totalCount)
+      case other: SparseCMS[K] =>
+        // This SparseCMS's maxExactCount is used, so ++ is not commutative
+        val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+        if (newTable.size < maxExactCount) {
+          // still sparse
+          SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+        } else {
+          toDense(newTable, params)
+        }
+
+      case other: CMSInstance[K] => other ++ this
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    exactCountTable.iterator
+      .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+      .reduceOption(_ + _)
+      .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+  /**
+   * Creates a new [[SparseCMS]] with an empty exactCountTable.
+   */
+  def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+    val exactCountTable = Map[K, Long]()
+    SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+  }
+
+  /**
+   * Creates a new [[CMSInstance]] from a Map[K, Long]
+   */
+  def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+    exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+      cms + (x, count)
+    }
+}
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
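+ *
+ * A sketch of how a dense instance typically arises in this file (using `CMS.monoid` and `create`, as in the
+ * usage example on [[CMS]]; the concrete numbers are illustrative only):
+ * {{{
+ * val monoid = CMS.monoid[Long](eps = 0.001, delta = 1E-10, seed = 1)
+ * // Summing many single-item sketches starts out sparse and is promoted to a
+ * // dense CMSInstance once the exact-count table exceeds maxExactCount.
+ * val cms: CMS[Long] = monoid.sum((1L to 100000L).map(monoid.create))
+ * }}}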
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+   * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+   * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
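+     *
+     * For example (an illustrative sketch; positions are `(row, col)` pairs):
+     * {{{
+     * val bumped = table + ((0, 3), 5L) // cell (0, 3) grows by 5; all other cells are unchanged
+     * }}}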
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
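+   *
+   * For example (a small sketch; `m` stands for any `TopCMSMonoid[String]`, e.g. one built via
+   * `TopPctCMS.monoid` below):
+   * {{{
+   * val sketch: TopCMS[String] = m.create(Seq("a", "b", "a"))
+   * }}}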
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
+ *   Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have fewer than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have fewer than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
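+ *
+ * For example, with (Country, City) pairs (an illustrative sketch; the parameters are arbitrary and the
+ * helpers are defined below in [[ScopedTopNCMS]]):
+ * {{{
+ * val m = ScopedTopNCMS.monoid[String, String](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 2)
+ * val cms = m.create(Seq("US" -> "NYC", "US" -> "LA", "US" -> "SF", "FR" -> "Paris"))
+ * // at most 2 cities are kept per country, so "FR" -> "Paris" survives even with a low global count
+ * }}}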
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-2.13/DecayedVector.scala b/algebird-core/src/main/scala-2.13/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-2.13/DecayingCMS.scala b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
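+     *
+     * A worked example of the formula derived in the method body (`timeToUnit = log(|value|) / log(2) +
+     * timeInHL`): a value of 8.0 at `timeInHL = 0.0` halves to 1.0 after exactly three half-lives, so its
+     * `timeToUnit` is 3.0.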
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        // 1 = value * module.getScale(0.0, timeInHL, result)
+        // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        // log(value) = (result - timeInHL) * log2
+        // log(value) / log2 = result - timeInHL
+        // log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   *   delta = (t1 - t0) / halflife
+   *   y1 = y0 * 2^(-delta) + n
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   *   (1) zN = yN * scaleN
+   *
+   * Our constraint is expressed as:
+   *
+   *   (2) If n=0, z1 = z0
+   *
+   * In that case:
+   *
+   *   (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   *   (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   *   (5) 2^(-delta) * scale1 = scale0
+   *   (6) scale1 = scale0 * 2^(delta)
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   *   (7) z1 = y1 * scale1
+   *   (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   *   (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   *   (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   *   (11) z1 / scale1 = z0 / scale1 + n
+   *   (12) z1 = z0 + n * scale1
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   *   (13) logscale1 = logscale0 + delta * log(2)
+   *   (14) z1 = z0 + n * exp(logscale1)
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   *   (15) y1 = z1 / scale1
+   *   (16) y1 = z1 / exp(logscale1)
+   *   (17) y1 = z1 * exp(-logscale1)
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
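+     *
+     * A small usage sketch (`cms` stands for any CMS value from this module):
+     * {{{
+     * val (lo, hi) = cms.range
+     * // every key k satisfies lo <= cms.get(k), and the largest key is not reported above hi
+     * }}}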
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+      @inline def prod(x: Double, y: Double): Double =
+        if (x == 0.0) 0.0 else x * y
+
+      var i = 0
+      while (i < depth) {
+        val left = x.cells(i)
+        val right = y.cells(i)
+        var j = 0
+        val bldr = rowBuilder()
+        while (j < width) {
+          bldr += prod(left(j), xscale) + prod(right(j), yscale)
+          j += 1
+        }
+        cms.cells(i) = bldr.result()
+        i += 1
+      }
+      cms
+    }
+
+    def add(t: Long, k: K, n: Double): CMS =
+      scaledAdd(fromTimestamp(t), k, n)
+
+    // TODO: we could allocate a mutable scratch pad, write all the
+    // values into it, and then build a CMS out of it. if items is
+    // very small, this would be less efficient than what we're doing
+    // now. probably the "ideal" solution would be to determine how many
+    // items there are. if we have fewer than ~width items, this
+    // approach is fine. for more, a scratch pad would be better
+    // (assuming we wrote that code).
+    //
+    // alternately, you could map items into (zero + item) and then
+    // use the monoid's sum to boil it down.
+    //
+    // we only use this in testing currently so the current code is
+    // fine until we rely on it in production. any change here should
+    // probably include benchmarks justifying the design.
+    def bulkAdd(items: Iterable[(Long, K, Double)]): CMS =
+      items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) }
+
+    private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS =
+      if (n < 0.0) {
+        val t = toTimestamp(ts1)
+        throw new IllegalArgumentException(
+          s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t"
+        )
+      } else if (n == 0.0) {
+        this
+      } else {
+        val logScale1 = nextLogScale(ts1)
+        if (logScale1 > maxLogScale) {
+          rescaleTo(ts1).scaledAdd(ts1, k, n)
+        } else {
+          val increment = n * Math.exp(logScale1)
+          val cells1 = allocCells()
+          var didx = 0
+          while (didx < depth) {
+            val cell = cells(didx)
+            val w = hashFns(didx)(k)
+            cells1(didx) = cell.updated(w, cell(w) + increment)
+            didx += 1
+          }
+          new CMS(cells1, logScale1, ts1)
+        }
+      }
+
+    // Set the scale back to 0.0
+    // input time is in half-lives
+    private[algebird] def rescaleTo(ts: Double): CMS = {
+      val logScale1 = nextLogScale(ts)
+      val expL = Math.exp(-logScale1)
+      if (expL == 0.0) {
+        new CMS(monoid.zero.cells, 0.0, ts)
+      } else {
+        val cms = new CMS(allocCells(), 0.0, ts)
+        var i = 0
+        while (i < depth) {
+          val ci = cells(i)
+          cms.cells(i) = ci.map(_ * expL)
+          i += 1
+        }
+        cms
+      }
+    }
+  }
+
+  private def rowBuilder() = {
+    val bldr = Vector.newBuilder[Double]
+    bldr.sizeHint(width)
+    bldr
+  }
+
+  object CMS {
+
+    implicit val monoidForCMS: Monoid[CMS] =
+      new Monoid[CMS] {
+
+        def zero: CMS = module.empty
+
+        def plus(x: CMS, y: CMS): CMS =
+          x + y
+
+        /**
+         * Turn a flat array into an array of vectors.
+         */
+        private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = {
+          val cells = new Array[Vector[Double]](depth)
+          var i = 0
+          while (i < depth) {
+            var j = i * width
+            val limit = j + width
+            val bldr = rowBuilder()
+            while (j < limit) {
+              bldr += scratch(j)
+              j += 1
+            }
+            cells(i) = bldr.result()
+            i += 1
+          }
+          cells
+        }
+
+        /**
+         * This method sums the first `num` items in `arr`.
+ */ + private def innerSum(arr: Array[CMS], num: Int): CMS = + if (num == 0) zero + else if (num == 1) arr(0) + else if (num == 2) plus(arr(0), arr(1)) + else { + // start with zero + val scratch: Array[Double] = new Array(totalCells) + + val latestTimeInHL: Double = + arr.iterator.take(num).map(cms => cms.timeInHL).max + + var i = 0 + while (i < num) { + val cms = arr(i) + val scale = cms.getScale(latestTimeInHL) + var j = 0 + while (j < depth) { + val row = cms.cells(j) + val stride = j * width + var k = 0 + while (k < width) { + val n = row(k) + if (n > 0.0) { + scratch(stride + k) += scale * n + } + k += 1 + } + j += 1 + } + i += 1 + } + + val cells = scratchToCells(scratch) + + new CMS(cells, 0.0, latestTimeInHL) + } + + override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = { + + val it: Iterator[CMS] = xs.toIterator + val ChunkSize = 1000 + + // the idea here is that we read up to 1000 CMS values into + // a fixed array, crunch them down to a single CMS, store it + // in the first array index, read up to 999 more CMS values + // in, crunch them down, and so on. + var i = 0 + val arr = new Array[CMS](ChunkSize) + while (it.hasNext) { + while (it.hasNext && i < ChunkSize) { + arr(i) = it.next() + i += 1 + } + if (i > 1) { + arr(0) = innerSum(arr, i) + } + i = 1 + } + if (i == 0) None else Some(arr(0)) + } + } + } + + val monoid: Monoid[CMS] = CMS.monoidForCMS +} + +object DecayingCMS { + + /** + * Construct a DecayingCMS module. + * + * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will + * always produce the same hash family. + * + * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by + * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to + * zero. + * + * The size of the CMS in bytes is O(depth * width). + * + * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use + * width=100, for 0.1% error, use width=1000, etc. + * + * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha * + * depth) so, a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this + * as small as possible. + */ + def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit + hasher: CMSHasher[K] + ): DecayingCMS[K] = + new DecayingCMS(seed, halfLife, depth, width, hasher) +} diff --git a/algebird-core/src/main/scala-2.13/Fold.scala b/algebird-core/src/main/scala-2.13/Fold.scala new file mode 100644 index 000000000..0b89f2d62 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Fold.scala @@ -0,0 +1,352 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import java.io.Serializable +import scala.collection.compat._ + +/** + * Folds are first-class representations of "Traversable.foldLeft." 
They have the nice property that they can + * be fused to work in parallel over an input sequence. + * + * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when + * done. We use existential types to hide internal details and to allow for internal and external (X and O) + * types to differ for "map" and "join." + * + * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a + * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the + * fold. + * + * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like + * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also + * expose some internal state so library authors can fold over their own types. + * + * See the companion object for constructors. + */ +sealed trait Fold[-I, +O] extends Serializable { + + /** + * Users can ignore this type. + * + * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good + * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it + * provides. + */ + type X + + /** + * Users can ignore this method. It is exposed so library authors can run folds over their own sequence + * types. + * + * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the + * same Fold many times over different data structures, but we must build a new FoldState every time. + * + * See FoldState for information on how to use this for your own sequence types. + */ + def build(): FoldState[X, I, O] + + /** + * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or + * "Function1.compose." + */ + def map[P](f: O => P): Fold[I, P] = { + val self = this + new Fold[I, P] { + type X = self.X + override def build(): FoldState[X, I, P] = + self.build().map(f) + } + } + + /** + * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time + * and combines at the end. + */ + def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = { + val self = this + new Fold[I2, Q] { + type X = (self.X, other.X) + override def build(): FoldState[X, I2, Q] = { + val first = self.build() + val second = other.build() + new FoldState( + { case ((x, y), i) => (first.add(x, i), second.add(y, i)) }, + (first.start, second.start), + { case (x, y) => f(first.end(x), second.end(y)) } + ) + } + } + } + + /** + * Convenient shorthand for joining Folds without combining at the end. + */ + def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] = + joinWith(other) { case (o, p) => (o, p) } + + /** + * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.") + * This is analogous to "Function1.andThen." + */ + def contramap[H](f: H => I): Fold[H, O] = { + val self = this + new Fold[H, O] { + type X = self.X + override def build(): FoldState[X, H, O] = + self.build().contramap(f) + } + } + + /** + * Trivially runs a Fold over an empty sequence. + */ + def overEmpty: O = { + // build is a "def" so we construct the state once and use the pieces to run the fold + val state = build() + state.end(state.start) + } + + /** + * Trivially runs a Fold over a single element sequence. 
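+   *
+   * For example, using the `size` fold defined in the companion object:
+   * {{{
+   * Fold.size.overSingleton("a") // == 1L
+   * }}}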
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
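A sketch of `Fold.fold`'s separate-accumulator style (editorial, assuming the snapshot compiles as shown): a mean fold whose `(sum, count)` accumulator is hidden behind the existential `X`, so callers only ever see the `Double` output:

```scala
import com.twitter.algebird.Fold

// Editorial sketch, not part of the diff.
object CustomFold {
  // The (sum, count) accumulator stays internal to the Fold.
  val mean: Fold[Double, Double] =
    Fold.fold[(Double, Long), Double, Double](
      { case ((s, n), x) => (s + x, n + 1L) },       // add
      (0.0, 0L),                                     // start
      { case (s, n) => if (n == 0L) 0.0 else s / n } // end
    )

  def main(args: Array[String]): Unit =
    println(mean.overTraversable(List(1.0, 2.0, 6.0))) // 3.0
}
```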
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const: def const[A, B](b: B): (A
+   * \=> B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
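The stock folds above compose the same way as hand-written ones; a hypothetical one-pass min/max/size (editorial sketch, not part of the diff):

```scala
import com.twitter.algebird.Fold

// Editorial sketch, not part of the diff.
object BuiltInFolds {
  def main(args: Array[String]): Unit = {
    // min, max and size run together in a single traversal.
    val stats = Fold.min[Int].join(Fold.max[Int]).join(Fold.size)
    println(stats.overTraversable(List(3, 1, 4, 1, 5)))
    // ((Some(1),Some(5)),5)
  }
}
```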
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-2.13/Interval.scala b/algebird-core/src/main/scala-2.13/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
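A hedged usage sketch of the core `Interval` operations so far (editorial; `IntervalBasics` is an invented name, and the bound case classes appear later in this same file):

```scala
import com.twitter.algebird._

// Editorial sketch, not part of the diff.
object IntervalBasics {
  def main(args: Array[String]): Unit = {
    // Intersecting a lower and an upper bound yields [3, 10).
    val range: Interval[Int] = InclusiveLower(3) && ExclusiveUpper(10)

    println(range(3))                   // true: apply is an alias for contains
    println(range(10))                  // false: the upper bound is exclusive
    println((range && Empty[Int]())(3)) // false: Empty absorbs any intersection
  }
}
```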
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that
+     * intr.contains(s)
+     *
+     * If this returns None, it may be Empty, Upper or Universe
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s)
+     *
+     * If this returns None, it may be Empty, Lower, or Universe
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
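A sketch of the subtlety this comment describes, assuming algebird's usual implicit `Successible`/`Predecessible` instances for `Int` are in scope (an assumption; they are not shown in this diff):

```scala
import com.twitter.algebird._

// Editorial sketch, not part of the diff.
object DiscreteEmptiness {
  def main(args: Array[String]): Unit = {
    // (0, 1) is non-empty over Double but contains no Int at all.
    // Ordering alone cannot see this; isEmpty consults Successible/Predecessible.
    val open01: Interval[Int] = Interval.open(0, 1)
    println(open01.isEmpty) // true: no integer lies strictly between 0 and 1
  }
}
```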
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option because of cases like
+   * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty.
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p:
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot be
+   * normalized to [a, b) form, thus we return an Option. Also, there are cases, like [Int.MinValue,
+   * Int.MaxValue], that cannot be expressed in this form yet are actually equivalent to Universe. The bottom
+   * line: if this returns None, it just means you can't express the interval this way; it does not mean it
+   * is empty or universe, etc... (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?]
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-2.13/JavaMonoids.scala b/algebird-core/src/main/scala-2.13/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-2.13/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */
+package com.twitter.algebird
+
+import java.lang.{
+  Boolean => JBool,
+  Double => JDouble,
+  Float => JFloat,
+  Integer => JInt,
+  Long => JLong,
+  Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+  override val zero: JInt = JInt.valueOf(0)
+  override val one: JInt = JInt.valueOf(1)
+  override def plus(x: JInt, y: JInt): JInt = x + y
+  override def negate(x: JInt): JInt = -x
+  override def minus(x: JInt, y: JInt): JInt = x - y
+  override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+  override val zero: JShort = Short.box(0)
+  override val one: JShort = Short.box(1)
+  override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+  override def negate(x: JShort): JShort = (-x).toShort
+  override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+  override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+  override val zero: JLong = JLong.valueOf(0L)
+  override val one: JLong = JLong.valueOf(1L)
+  override def plus(x: JLong, y: JLong): JLong = x + y
+  override def negate(x: JLong): JLong = -x
+  override def minus(x: JLong, y: JLong): JLong = x - y
+  override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+  override val zero: JFloat = JFloat.valueOf(0.0f)
+  override val one: JFloat = JFloat.valueOf(1.0f)
+  override def plus(x: JFloat, y: JFloat): JFloat = x + y
+  override def negate(x: JFloat): JFloat = -x
+  override def minus(x: JFloat, y: JFloat): JFloat = x - y
+  override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+  override val zero: JDouble = JDouble.valueOf(0.0)
+  override val one: JDouble = JDouble.valueOf(1.0)
+  override def plus(x: JDouble, y: JDouble): JDouble = x + y
+  override def negate(x: JDouble): JDouble = -x
+  override def minus(x: JDouble, y: JDouble): JDouble = x - y
+  override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+  override val zero: JBool = JBool.FALSE
+  override val one: JBool = JBool.TRUE
+  override def plus(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue ^ y.booleanValue)
+  override def negate(x: JBool): JBool = x
+  override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+  override def times(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since Lists are mutable, this always makes a full copy. Prefer scala immutable Lists: if you use scala
+ * immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+  override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+  override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+  override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+    val res = new JArrayList[T](x.size + y.size)
+    res.addAll(x)
+    res.addAll(y)
+    res
+  }
+}
+
+/**
+ * Since maps are mutable, this always makes a full copy. Prefer scala immutable maps: if you use scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+  override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+  val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+    case mon: Monoid[?]
=> mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: JMap[K, V]): Boolean = + !x.isEmpty && (implicitly[Semigroup[V]] match { + case mon: Monoid[?] => + x.values.asScala.exists(v => mon.isNonZero(v)) + case _ => true + }) + override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + val vsemi = implicitly[Semigroup[V]] + val result = new JHashMap[K, V](big.size + small.size) + result.putAll(big) + small.entrySet.asScala.foreach { kv => + val smallK = kv.getKey + val smallV = kv.getValue + if (big.containsKey(smallK)) { + val bigV = big.get(smallK) + val newV = + if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV) + if (nonZero(newV)) + result.put(smallK, newV) + else + result.remove(smallK) + } else { + // No need to explicitly add with zero on V, just put in the small value + result.put(smallK, smallV) + } + } + result + } +} diff --git a/algebird-core/src/main/scala-2.13/MapAlgebra.scala b/algebird-core/src/main/scala-2.13/MapAlgebra.scala new file mode 100644 index 000000000..9ca370eaf --- /dev/null +++ b/algebird-core/src/main/scala-2.13/MapAlgebra.scala @@ -0,0 +1,320 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import com.twitter.algebird.macros.{Cuber, Roller} +import scala.collection.mutable.{Builder, Map => MMap} +import scala.collection.{Map => ScMap} +import algebra.ring.Rng +import scala.collection.compat._ + +trait MapOperations[K, V, M <: ScMap[K, V]] { + def add(oldMap: M, kv: (K, V)): M + def remove(oldMap: M, k: K): M + def fromMutable(mut: MMap[K, V]): M +} + +abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V]) + extends Monoid[M] + with MapOperations[K, V, M] { + + val nonZero: (V => Boolean) = semigroup match { + case mon: Monoid[?] => mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: M): Boolean = + !x.isEmpty && (semigroup match { + case mon: Monoid[?] => + x.valuesIterator.exists(v => mon.isNonZero(v)) + case _ => true + }) + + override def plus(x: M, y: M): M = { + // Scala maps can reuse internal structure, so don't copy just add into the bigger one: + // This really saves computation when adding lots of small maps into big ones (common) + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small match { + // Mutable maps create new copies of the underlying data on add so don't use the + // handleImmutable method. + // Cannot have a None so 'get' is safe here. + case _: MMap[?, ?] 
=> sumOption(Seq(big, small)).get + case _ => handleImmutable(big, small, bigOnLeft) + } + } + + private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) = + small.foldLeft(big) { (oldMap, kv) => + val newV = big + .get(kv._1) + .map { bigV => + if (bigOnLeft) + semigroup.plus(bigV, kv._2) + else + semigroup.plus(kv._2, bigV) + } + .getOrElse(kv._2) + if (nonZero(newV)) + add(oldMap, kv._1 -> newV) + else + remove(oldMap, kv._1) + } + override def sumOption(items: TraversableOnce[M]): Option[M] = + if (items.iterator.isEmpty) None + else { + val mutable = MMap[K, V]() + items.iterator.foreach { m => + m.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val newV = + if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v) + if (nonZero(newV)) + mutable.update(k, newV) + else + mutable.remove(k) + } + } + Some(fromMutable(mutable)) + } +} + +class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] { + override lazy val zero: Map[K, V] = Map[K, V]() + override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv + override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): Map[K, V] = + new MutableBackedMap(mut) +} + +class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] { + override lazy val zero: ScMap[K, V] = ScMap[K, V]() + override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv + override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): ScMap[K, V] = + new MutableBackedMap(mut) +} + +/** + * You can think of this as a Sparse vector group + */ +class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] { + override def negate(kv: Map[K, V]): Map[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +class ScMapGroup[K, V](implicit val group: Group[V]) + extends ScMapMonoid[K, V]()(group) + with Group[ScMap[K, V]] { + override def negate(kv: ScMap[K, V]): ScMap[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +/** + * You can think of this as a Sparse vector ring + */ +trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] { + + implicit def ring: Ring[V] + + override def times(x: M, y: M): M = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small.foldLeft(zero) { (oldMap, kv) => + val bigV = big.getOrElse(kv._1, ring.zero) + val newV = + if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV) + if (ring.isNonZero(newV)) { + add(oldMap, kv._1 -> newV) + } else { + remove(oldMap, kv._1) + } + } + } +} + +class MapRing[K, V](implicit override val ring: Ring[V]) + extends MapGroup[K, V]()(ring) + with GenericMapRing[K, V, Map[K, V]] + +class ScMapRing[K, V](implicit override val ring: Ring[V]) + extends ScMapGroup[K, V]()(ring) + with GenericMapRing[K, V, ScMap[K, V]] + +object MapAlgebra { + def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean = + l.forall { case (k, v) => + r.get(k).exists(Equiv[V].equiv(_, v)) + } + + implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] = + Equiv.fromFunction { (m1, m2) => + val cleanM1 = removeZeros(m1) + val cleanM2 = removeZeros(m2) + rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, 
cleanM1)
+    }
+
+  def mergeLookup[T, U, V: Monoid](
+      keys: TraversableOnce[T]
+  )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+    sumByKey {
+      keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+    }
+
+  // Returns a new map with zero-value entries removed
+  def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+    m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+  /**
+   * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+   * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+   * equivalent to:
+   *
+   * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+   *
+   * Otherwise, the function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+   */
+  def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+    Monoid.sum(pairs.iterator.map(Map(_)))
+
+  /**
+   * For each key, creates a list of all values. This function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2))
+   */
+  def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+    if (pairs.iterator.isEmpty) Map.empty
+    else {
+      val mutable = MMap[K, Builder[V, List[V]]]()
+      pairs.iterator.foreach { case (k, v) =>
+        val oldVOpt = mutable.get(k)
+        // sorry for the micro optimization here: avoiding a closure
+        val bldr = if (oldVOpt.isEmpty) {
+          val b = List.newBuilder[V]
+          mutable.update(k, b)
+          b
+        } else oldVOpt.get
+        bldr += v
+      }
+      mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+    }
+
+  // Consider this as edges from k -> v, produce a Map[K, Set[V]]
+  def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+    Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+  /** join the keys of two maps (similar to outer-join in a DB) */
+  def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+    Monoid
+      .plus(
+        map1.transform { case (_, v) =>
+          (List(v), List[W]())
+        },
+        map2.transform { case (_, w) =>
+          (List[V](), List(w))
+        }
+      )
+      .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
+
+  /**
+   * Reverses a graph losslessly. The None key is for v's with no sources.
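An editorial sketch of the workhorse functions above (`sumByKey`, `group`, `join`), not part of the diff and assuming the snapshot compiles as shown:

```scala
import com.twitter.algebird.MapAlgebra

// Editorial sketch, not part of the diff.
object MapAlgebraBasics {
  def main(args: Array[String]): Unit = {
    val pairs = List("a" -> 1, "b" -> 2, "a" -> 3)

    // Values are combined per key with Semigroup[Int].
    println(MapAlgebra.sumByKey(pairs)) // Map(a -> 4, b -> 2)

    // group keeps every value, preserving encounter order.
    println(MapAlgebra.group(pairs))    // Map(a -> List(1, 3), b -> List(2))

    // join is an outer join on the two key sets.
    println(MapAlgebra.join(Map("a" -> 1), Map("b" -> "x")))
    // Map(a -> (Some(1),None), b -> (None,Some(x)))
  }
}
```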
+   */
+  def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+    def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+      if (i.isEmpty) Iterable(None)
+      else {
+        i.map(Some(_))
+      }
+
+    Monoid.sum {
+      for {
+        (k, sv) <- m.view.toIterable
+        v <- nonEmptyIter(sv)
+      } yield Map(v -> k.toSet)
+    }
+  }
+
+  /**
+   * Invert the common case of exactly one value for each key
+   */
+  def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+    Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+  def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+    Monoid.sum(mring.times(left, right).values)
+
+  def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+    val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      c(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None     => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+  def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit c: Cuber[K]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+  def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+    val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      r(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None     => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+  def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit r: Roller[K]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-2.13/Scan.scala b/algebird-core/src/main/scala-2.13/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+  /**
+   * Most consumers of Scan don't care about the State type variable. But for those that do, we make an
+   * effort to expose it in all of our combinators.
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+   * stream of outputs. Streams can be thought of as being a hidden state that is queryable for a head
+   * element, and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head element and tail stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+   *   aggregator.prepare(a_1) + ... +
+   *   aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final
+      // summation; this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+   *   monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair within
+ * the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessingI, along with the
+   *   result of updating stateBeforeProcessingI with the information from i.
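An editorial sketch of driving a `Scan` (object name invented; assumes the `scala-collection-compat` `BuildFrom` imported above):

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object ScanBasics {
  def main(args: Array[String]): Unit = {
    // A running sum: each output is the sum of everything seen so far.
    val runningSum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // N inputs produce exactly N outputs, unlike scanLeft's N + 1.
    println(runningSum(List(1, 2, 3, 4))) // List(1, 3, 6, 10)

    // Scan.index ignores its inputs and emits positions.
    println(Scan.index(List("a", "b", "c"))) // List(0, 1, 2)
  }
}
```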
+   */
+  def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+  /**
+   * @param iter
+   * @return
+   *   If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_i, state_i) =
+   *   presentAndNextState(a_i, state_(i-1))` and `state_0 = initialState`
+   */
+  def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+    override def hasNext: Boolean = iter.hasNext
+    var state: State = initialState
+    override def next(): O = {
+      val thisState = state
+      val thisA = iter.next()
+      val (thisC, nextState) = presentAndNextState(thisA, thisState)
+      state = nextState
+      thisC
+    }
+  }
+
+  /**
+   * @param inputs
+   * @param bf
+   * @tparam In
+   *   The type of the input collection
+   * @tparam Out
+   *   The type of the output collection
+   * @return
+   *   Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the
+   *   form: `[o_1, ..., o_n]` where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`.
+   */
+  def apply[In <: TraversableOnce[I], Out](
+      inputs: In
+  )(implicit bf: BuildFrom[In, O, Out]): Out =
+    bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+  // combinators
+
+  /**
+   * Return a new scan that is the same as this scan, but with a different `initialState`.
+   * @param newInitialState
+   * @return
+   */
+  def replaceState(newInitialState: => State): Aux[I, State, O] =
+    from(newInitialState)(presentAndNextState(_, _))
+
+  def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+    presentAndNextState(f(i), stateBeforeProcessingI)
+  }
+
+  def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    (g(c), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+   * pollute the `State` by pairing it redundantly with `Unit`.
+   * @tparam I1
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, then this results in a Scan whose `apply` method returns
+   *   `[(o_1, a_1), ..., (o_n, a_n)]` when given the same input.
+   */
+  def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+    ((o, i), stateAfterProcessingI)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan before each input updates the state.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+   */
+  def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((stateBeforeProcessingI, o), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan after each input updates the state.
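The two state-joining combinators differ only in which state they pair with each output; a hypothetical sketch (editorial, not part of the diff):

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object StateJoins {
  def main(args: Array[String]): Unit = {
    val sum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // Pair each output with the state *before* the input was folded in...
    println(sum.joinWithPriorState(List(1, 2, 3)))
    // List((0,1), (1,3), (3,6))

    // ...or with the state *after*.
    println(sum.joinWithPosteriorState(List(1, 2, 3)))
    // List((1,1), (3,3), (6,6))
  }
}
```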
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose apply method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]` then `zip` will return a scan
+   *   whose apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]` results in the output
+   *   `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) ==
+   *   scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]` then `join` will return a scan
+   *   whose apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]` then `compose` will return a
+   *   scan which returns `[p_1, ..., p_n]`.
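A short editorial sketch contrasting `join` (two scans over the same inputs, fused into one pass) with `compose` (one scan's outputs piped into another); names invented, not part of the diff:

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object ScanCombinators {
  def main(args: Array[String]): Unit = {
    val sum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // join runs two scans over the same inputs in one pass.
    println(sum.join(Scan.index)(List(10, 20, 30)))
    // List((10,0), (30,1), (60,2))

    // compose feeds sum's outputs into a running-maximum scan.
    val maxSoFar: Scan.Aux[Int, Int, Int] =
      Scan.from(Int.MinValue)((i, m) => { val m2 = math.max(i, m); (m2, m2) })
    println(sum.compose(maxSoFar)(List(3, -5, 7)))
    // List(3, 3, 5)
  }
}
```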
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.13/SpaceSaver.scala b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public api to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing, in order:
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the counters (a 4-byte count, then for each counter: item length + item bytes + two 8-byte
+   *     counts)
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer,
tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] = + Try { + bb.get().toInt match { + case 1 => + val capacity = bb.getInt + val itemLength = bb.getInt + val itemAsBytes = new Array[Byte](itemLength) + bb.get(itemAsBytes) + tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item)) + case 2 => + val capacity = bb.getInt + + var countersToDeserialize = bb.getInt + val counters = scala.collection.mutable.Map.empty[T, (Long, Long)] + while (countersToDeserialize != 0) { + val itemLength = bb.getInt() + val itemAsBytes = new Array[Byte](itemLength) + bb.get(itemAsBytes) + val item = tDeserializer(ByteBuffer.wrap(itemAsBytes)) + + val a = bb.getLong + val b = bb.getLong + + item match { + case Failure(e) => return Failure(e) + case Success(i) => + counters += ((i, (a, b))) + } + + countersToDeserialize -= 1 + } + + Success(SSMany(capacity, counters.toMap)) + } + }.flatten +} + +/** + * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements. + * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See + * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called + * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and + * parallelization were not described in the article and have not been proven to be mathematically correct or + * to preserve the guarantees or benefits of the algorithm. + */ +sealed abstract class SpaceSaver[T] { + import SpaceSaver.ordering + + /** + * Maximum number of counters to keep (parameter "m" in the research paper). + */ + def capacity: Int + + /** + * Current lowest value for count + */ + def min: Long + + /** + * Map of item to counter, where each counter consists of an observed count and possible over-estimation + * (error) + */ + def counters: Map[T, (Long, Long)] + + def ++(other: SpaceSaver[T]): SpaceSaver[T] + + /** + * Returns the frequency estimate for the item + */ + def frequency(item: T): Approximate[Long] = { + val (count, err) = counters.getOrElse(item, (min, min)) + Approximate(count - err, count, count, 1.0) + } + + /** + * Get the elements that show up more than thres times. Results are sorted in descending order: (item, + * Approximate[Long], guaranteed) + */ + def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] = + counters.iterator + .filter { case (_, (count, _)) => count >= thres } + .toList + .sorted(ordering) + .map { case (item, (count, err)) => + (item, Approximate(count - err, count, count, 1.0), thres <= count - err) + } + + /** + * Get the top-k elements. Results are sorted in descending order: (item, Approximate[Long], guaranteed) + */ + def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = { + require(k < capacity) + val si = counters.toList + .sorted(ordering) + val siK = si.take(k) + val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L) + siK.map { case (item, (count, err)) => + (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err) + } + } + + /** + * Check consistency with other SpaceSaver, useful for testing.
Returns boolean indicating if they are + * consistent + */ + def consistentWith(that: SpaceSaver[T]): Boolean = + (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0) +} + +case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] { + require(capacity > 1) + + override def min: Long = 0L + + override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L))) + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => SSMany(this).add(other) + case other: SSMany[?] => other.add(this) + } +} + +object SSMany { + private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] = + SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap + + private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] = + SSMany(capacity, counters, bucketsFromCounters(counters)) + + private[algebird] def apply[T](one: SSOne[T]): SSMany[T] = + SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item))) +} + +case class SSMany[T] private ( + override val capacity: Int, + override val counters: Map[T, (Long, Long)], + buckets: SortedMap[Long, Set[T]] +) extends SpaceSaver[T] { + private val exact: Boolean = counters.size < capacity + + override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey + + // item is already present and just needs to be bumped up one + private def bump(item: T) = { + val (count, err) = counters(item) + val counters1 = counters + (item -> ((count + 1L, err))) // increment by one + val currBucket = buckets(count) // current bucket + val buckets1 = { + if (currBucket.size == 1) // delete current bucket since it will be empty + buckets - count + else // remove item from current bucket + buckets + (count -> (currBucket - item)) + } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // lose one item to meet capacity constraint + private def loseOne = { + val firstBucket = buckets(buckets.firstKey) + val itemToLose = firstBucket.head + val counters1 = counters - itemToLose + val buckets1 = + if (firstBucket.size == 1) + buckets - min + else + buckets + (min -> (firstBucket - itemToLose)) + SSMany(capacity, counters1, buckets1) + } + + // introduce new item + private def introduce(item: T, count: Long, err: Long) = { + val counters1 = counters + (item -> ((count, err))) + val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // add a single element + private[algebird] def add(x: SSOne[T]): SSMany[T] = { + require(x.capacity == capacity) + if (counters.contains(x.item)) + bump(x.item) + else + (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min) + } + + // merge two stream summaries + private def merge(x: SSMany[T]): SSMany[T] = { + require(x.capacity == capacity) + val counters1 = Map() ++ + (counters.keySet ++ x.counters.keySet).toList + .map { key => + val (count1, err1) = counters.getOrElse(key, (min, min)) + val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min)) + key -> ((count1 + count2, err1 + err2)) + } + .sorted(SpaceSaver.ordering) + .take(capacity) + SSMany(capacity, counters1) + } + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => add(other) + case other: SSMany[?] 
=> merge(other) + } +} + +class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] { + override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y +} diff --git a/algebird-core/src/main/scala-2.13/VectorSpace.scala b/algebird-core/src/main/scala-2.13/VectorSpace.scala new file mode 100644 index 000000000..f8818600c --- /dev/null +++ b/algebird-core/src/main/scala-2.13/VectorSpace.scala @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +import scala.annotation.implicitNotFound + +/** + * This class represents a vector space. For the required properties see: + * + * http://en.wikipedia.org/wiki/Vector_space#Definition + */ +object VectorSpace extends VectorSpaceOps with Implicits + +sealed trait VectorSpaceOps { + def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] = + vs.scale(v, c) + def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] = + new VectorSpace[F, C] { + override def ring: Ring[F] = r + override def group: Group[C[F]] = cGroup + override def scale(v: F, c: C[F]): C[F] = + if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero + } +} +private object VectorSpaceOps extends VectorSpaceOps + +sealed trait Implicits extends LowPrioImplicits { + implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] = + VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _))) +} + +sealed trait LowPrioImplicits { + implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] = + VectorSpaceOps.from[T, Map[K, _]] { (s, m) => + m.transform { case (_, v) => Ring.times(s, v) } + } +} + +@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}") +trait VectorSpace[F, C[_]] extends java.io.Serializable { + implicit def ring: Ring[F] + def field: Ring[F] = ring // this is for compatibility with older versions + implicit def group: Group[C[F]] + def scale(v: F, c: C[F]): C[F] +} diff --git a/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala new file mode 100644 index 000000000..b6d5e2ffc --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala @@ -0,0 +1,37 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+ */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// Monad for Either, used for modeling computations that can fail, where L is the type of the error +object EitherMonad { + class Error[L] extends Monad[Either[L, *]] { + override def apply[R](r: R): Right[L, R] = Right(r) + + override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] = + self.right.flatMap(next) + + override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] = + self.right.map(fn) + } + + implicit def monad[L]: Monad[Either[L, _]] = new Error[L] + + def assert[L](truth: Boolean, failure: => L): Either[L, Unit] = + if (truth) Right(()) else Left(failure) +} diff --git a/algebird-core/src/main/scala-2.13/monad/Reader.scala b/algebird-core/src/main/scala-2.13/monad/Reader.scala new file mode 100644 index 000000000..e0747af20 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/Reader.scala @@ -0,0 +1,76 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// TODO this is general, move somewhere better + +// Reader Monad: represents a series of operations that read from a shared environment +// type (the input to the function) + +sealed trait Reader[-Env, +T] { + def apply(env: Env): T + def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] = + FlatMappedReader[E1, T, U](this, next) + def map[U](thatFn: T => U): Reader[Env, U] = + FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t))) +} + +final case class ConstantReader[+T](get: T) extends Reader[Any, T] { + override def apply(env: Any): T = get + override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get)) + override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] = + next(get) +} +final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] { + override def apply(env: E): T = fn(env) +} +final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] { + override def apply(env: E): T = { + @annotation.tailrec + def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any = + r match { + case ConstantReader(get) => + stack match { + case head :: tail => loop(head(get), tail) + case Nil => get + } + case ReaderFn(fn) => + stack match { + case head :: tail => loop(head(fn(env)), tail) + case Nil => fn(env) + } + case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack) + } + loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T] + } +} + +object Reader { + def const[T](t: T): Reader[Any, T] = ConstantReader(t) + implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn) + + class ReaderM[Env] extends Monad[Reader[Env, _]] { + override def apply[T](t: T): ConstantReader[T] = ConstantReader(t) + override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] = + self.flatMap(next) + override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U]
= self.map(fn) + } + + implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env] +} diff --git a/algebird-core/src/main/scala-2.13/monad/StateWithError.scala b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala new file mode 100644 index 000000000..e15a9ebc3 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala @@ -0,0 +1,130 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.{Monad, Semigroup} + +/** + * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase + * with existing mutable APIs (like Storm or Cascading), while retaining the ability to compose carefully. + */ +sealed trait StateWithError[S, +F, +T] { + def join[F1 >: F, U]( + that: StateWithError[S, F1, U], + mergeErr: (F1, F1) => F1, + mergeState: (S, S) => S + ): StateWithError[S, F1, (T, U)] = + join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState)) + + // TODO: deep joins could blow the stack, not yet using trampoline here + def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit + sgf: Semigroup[F1], + sgs: Semigroup[S] + ): StateWithError[S, F1, (T, U)] = + StateFn { (requested: S) => + (run(requested), that.run(requested)) match { + case (Right((s1, r1)), Right((s2, r2))) => + Right((sgs.plus(s1, s2), (r1, r2))) + case (Left(err1), Left(err2)) => + Left(sgf.plus(err1, err2)) // both sides failed: combine the errors + case (Left(err), _) => Left(err) + case (_, Left(err)) => Left(err) + } + } + + def apply(state: S): Either[F, (S, T)] = run(state) + + def run(state: S): Either[F, (S, T)] + + def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] = + FlatMappedState(this, next) + + def map[U](fn: (T) => U): StateWithError[S, F, U] = + FlatMappedState(this, (t: T) => StateWithError.const(fn(t))) +} + +/** Simple wrapper of a function in the Monad */ +final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] { + override def run(state: S): Either[F, (S, T)] = fn(state) +} + +/** + * A trampolining instance that should prevent stack overflow at the expense of performance + */ +final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U]) + extends StateWithError[S, F, U] { + override def run(state: S): Either[F, (S, U)] = { + @annotation.tailrec + def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any = + st match { + case StateFn(fn) => + fn(inState) match { + case err @ Left(_) => err // bail at first error + case noError @ Right((newState, out)) => + stack match { + case head :: tailStack => loop(newState, head(out), tailStack) + case Nil => noError // recursion ends + } + } + case FlatMappedState(st, next) => loop(inState, st, next :: stack) + } + loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]] + } +} + +object StateWithError { + def getState[S]: StateWithError[S,
Nothing, S] = + StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala 
b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala diff --git a/algebird-core/src/main/scala-3/Aggregator.scala b/algebird-core/src/main/scala-3/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-3/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. + * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def 
present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. + * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next()) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator.
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.iterator.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count).
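+ * + * A minimal sketch of that equivalence (illustrative; ties may be resolved differently): + * {{{ + * val longest3 = Aggregator.sortByReverseTake(3)((s: String) => s.length) + * // longest3(words) agrees with words.sortBy(_.length).reverse.take(3), up to ties + * }}}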
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100) + + /** + * Returns the lower bound of a given percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative.
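+ * + * A usage sketch (illustrative values; the result is a lower bound on the 50th percentile): + * {{{ + * val p50 = Aggregator.approximatePercentile[Long](0.5) + * val medianLowerBound = p50(Seq(12L, 3L, 7L, 25L)) + * }}}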
+ */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the JVM (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * {{{ + * import com.twitter.algebird.Ring.numericRing + * def numericAggregator[T: Numeric]: MonoidAggregator[T, T, T] = Aggregator.fromMonoid[T] + * }}} + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's folds package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get + + /** + * This is the safe version of the above.
If the input is empty, return None; else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for MonoidAggregators it never will; instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that)) + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators.
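+ * + * A small sketch (illustrative): + * {{{ + * val sums = Aggregator.fromMonoid[Int] + * val lasts = Aggregator.last[String] + * val both = sums.zip(lasts) + * // both(List((1, "a"), (2, "b"))) == (3, "b") + * }}}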
+ * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained, so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-3/CountMinSketch.scala b/algebird-core/src/main/scala-3/CountMinSketch.scala new file mode 100644 index 000000000..a526b2a51 --- /dev/null +++ b/algebird-core/src/main/scala-3/CountMinSketch.scala @@ -0,0 +1,1418 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */ +/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have fewer than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
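+ * + * A sketch of that pattern (a hypothetical helper, assuming a CMSParams[K] is in scope): + * {{{ + * def sumAll[K](params: CMSParams[K], items: Seq[K]): CMS[K] = { + * val summation = new CMSSummation(params) + * items.foreach(k => summation.insert(k, 1L)) + * summation.result // the mutable builder never escapes this method + * } + * }}}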
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`.
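+ * + * Concretely, per the implementation below: {{{ delta(depth) == math.exp(-depth) }}}, so e.g. delta(5) is roughly 0.0067.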
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +} + +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`.
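+ * + * Concretely (see [[CMSFunctions.depth]]): {{{ depth == math.ceil(math.log(1.0 / delta)).toInt }}}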
+   */
+  def depth: Int = CMSFunctions.depth(delta)
+
+  /**
+   * Number of counters per hash function (also: number of columns in the counting table). This number is
+   * derived from `eps`.
+   */
+  def width: Int = CMSFunctions.width(eps)
+
+  /**
+   * An optional parameter for how many exact counts a sparse CMS keeps.
+   */
+  def maxExactCountOpt: Option[Int]
+
+  /**
+   * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`.
+   */
+  def maxExactCount: Int =
+    CMSFunctions.maxExactCount(maxExactCountOpt, depth, width)
+
+  /**
+   * Returns a new sketch that is the combination of this sketch and the other sketch.
+   */
+  def ++(other: C[K]): C[K]
+
+  /**
+   * Counts the item and returns the result as a new sketch.
+   */
+  def +(item: K): C[K] = this + (item, 1L)
+
+  /**
+   * Counts the item `count` times and returns the result as a new sketch.
+   */
+  def +(item: K, count: Long): C[K]
+
+  /**
+   * Returns an estimate of the total number of times this item has been seen in the stream so far. This
+   * estimate is an upper bound.
+   *
+   * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also
+   * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`.
+   */
+  def frequency(item: K): Approximate[Long]
+
+  /**
+   * Returns an estimate of the inner product against another data stream.
+   *
+   * In other words, let a_i denote the number of times element i has been seen in the data stream summarized
+   * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of
+   * `<a, b> = \sum a_i b_i`.
+   *
+   * Note: This can also be viewed as the join size between two relations.
+   *
+   * It is always true that `actualInnerProduct <= estimatedInnerProduct`. With probability `p >= 1 - delta`,
+   * it also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`.
+   */
+  def innerProduct(other: C[K]): Approximate[Long]
+
+  /**
+   * Total number of elements counted (i.e. seen in the data stream) so far.
+   */
+  def totalCount: Long
+
+  /**
+   * The first frequency moment is the total number of elements in the stream.
+   */
+  def f1: Long = totalCount
+
+  /**
+   * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element.
+   */
+  def f2: Approximate[Long]
+
+}
+
+/**
+ * A trait for CMS implementations that can track heavy hitters in a data stream.
+ *
+ * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one
+ * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N"
+ * heavy hitters.
+ *
+ * Known implementations: [[TopCMS]].
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+trait CMSHeavyHitters[K] {
+
+  /**
+   * The pluggable logic of how heavy hitters are being tracked.
+   */
+  def heavyHittersLogic: HeavyHittersLogic[K]
+
+  /**
+   * Returns the set of heavy hitters.
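+   *
+   * For example (hypothetical): with [[TopNLogic]] and `heavyHittersN = 3`, this set contains at most three
+   * items.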
+   */
+  def heavyHitters: Set[K]
+
+}
+
+object CMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] =
+    monoid(eps, delta, seed, None)
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSMonoid[K] =
+    new CMSMonoid[K](eps, delta, seed, maxExactCountOpt)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] =
+    monoid(depth, width, seed, None)
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+  def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] =
+    aggregator(eps, delta, seed, None)
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSAggregator[K] =
+    new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] =
+    aggregator(depth, width, seed, None)
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+  /**
+   * Returns a fresh, zeroed CMS instance.
+   */
+  def apply[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int] = None
+  ): CMS[K] = {
+    val params = {
+      val hashes: Seq[CMSHash[K]] =
+        CMSFunctions.generateHashes(eps, delta, seed)
+      CMSParams(hashes, eps, delta, maxExactCountOpt)
+    }
+    CMSZero[K](params)
+  }
+
+}
+
+/**
+ * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data
+ * stream.
+ *
+ * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]].
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val cmsMonoid: CMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     CMS.monoid[Long](eps, delta, seed)
+ *   }
+ *
+ *   // Creates a CMS instance that has counted the element `1L`.
+ *   val cms: CMS[Long] = cmsMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = cms.frequency(1L)
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] {
+
+  override val eps: Double = params.eps
+
+  override val delta: Double = params.delta
+
+  override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+}
+
+/**
+ * Zero element. Used for initialization.
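+ *
+ * For example (hypothetical values), `CMS[Long](eps = 0.001, delta = 1e-10, seed = 1)` returns a [[CMSZero]]
+ * until the first item is counted.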
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+  override val totalCount: Long = 0L
+
+  override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+  override def ++(other: CMS[K]): CMS[K] = other
+
+  override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+    extends CMS[K](params) {
+
+  override def +(x: K, count: Long): CMS[K] =
+    SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] =>
+        CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+      case _ => other + item
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+    exactCountTable: Map[K, Long],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+  import SparseCMS._
+
+  override def +(x: K, count: Long): CMS[K] = {
+    val currentCount = exactCountTable.getOrElse(x, 0L)
+    val newTable = exactCountTable.updated(x, currentCount + count)
+    if (newTable.size < maxExactCount) {
+      // still sparse
+      SparseCMS(newTable, totalCount = totalCount + count, params = params)
+    } else {
+      toDense(newTable, params)
+    }
+  }
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?]     => this
+      case other: CMSItem[K] => this + (other.item, other.totalCount)
+      case other: SparseCMS[K] =>
+        // This SparseCMS's maxExactCount is used, so ++ is not commutative
+        val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+        if (newTable.size < maxExactCount) {
+          // still sparse
+          SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+        } else {
+          toDense(newTable, params)
+        }
+
+      case other: CMSInstance[K] => other ++ this
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    exactCountTable.iterator
+      .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+      .reduceOption(_ + _)
+      .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+  /**
+   * Creates a new [[SparseCMS]] with an empty exactCountTable.
+   */
+  def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+    val exactCountTable = Map[K, Long]()
+    SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+  }
+
+  /**
+   * Creates a new [[CMSInstance]] from a Map[K, Long].
+   */
+  def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+    exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+      cms + (x, count)
+    }
+}
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
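+ *
+ * A minimal sketch of typical use (hypothetical values; a `params: CMSParams[Long]` is assumed in scope):
+ * {{{
+ * val cms = CMSInstance[Long](params) + (1L, 2L) + (2L, 1L)
+ * cms.frequency(1L).estimate // >= 2, and <= 2 + eps * totalCount with probability 1 - delta
+ * }}}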
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?]     => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let A and B be two CMS instances, and let count_X[j, k] denote the value in X's 2-dimensional count
+   * table at row j and column k. Then the Count-Min sketch estimate of the inner product between A and B is
+   * the minimum inner product between their rows:
+   * estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
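+     *
+     * For example (hypothetical): `table + ((0, 3), 5L)` returns a copy of `table` with the cell at row 0,
+     * column 3 incremented by 5.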
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?]         => this
+    case other: TopCMSItem[K]     => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?]     => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
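+   *
+   * For example (hypothetical values, reusing `topPctCMSMonoid` from the [[TopCMS]] example):
+   * {{{
+   * val topCms = topPctCMSMonoid.create(Seq(1L, 1L, 2L)) // three counted items
+   * }}}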
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
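+ *
+ * For example (hypothetical): with `K1 = String` (country), `K2 = String` (city), and `heavyHittersN = 2`,
+ * the two most frequent cities are kept per country.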
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-3/DecayedVector.scala b/algebird-core/src/main/scala-3/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-3/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-3/DecayingCMS.scala b/algebird-core/src/main/scala-3/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-3/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
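+    // (a rough statement of the condition, not from the original docs: f should
+    // satisfy f(c * x, c * y) == c * f(x, y) for every decay factor c > 0, as
+    // +, -, min, and max do -- multiplication, for example, does not.)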
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
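+     *
+     * A worked example (computed from the formula below; not part of the original doc): a value of 8.0 at
+     * time `t` (in half-lives) decays to 1.0 three half-lives later, so its timeToUnit is `t + 3`.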
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        //   1 = value * module.getScale(0.0, timeInHL, result)
+        //   1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        //   1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        //   log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        //   -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        //   log(value) = getNextLogScale(0.0, timeInHL, result)
+        //   log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        //   log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        //   log(value) = (result - timeInHL) * log2
+        //   log(value) / log2 = result - timeInHL
+        //   log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   * {{{
+   * delta = (t1 - t0) / halflife
+   * y1 = y0 * 2^(-delta) + n
+   * }}}
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   * {{{
+   * (1) zN = yN * scaleN
+   * }}}
+   *
+   * Our constraint is expressed as:
+   *
+   * {{{
+   * (2) If n=0, z1 = z0
+   * }}}
+   *
+   * In that case:
+   *
+   * {{{
+   * (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   * (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   * (5) 2^(-delta) * scale1 = scale0
+   * (6) scale1 = scale0 * 2^(delta)
+   * }}}
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   * {{{
+   * (7) z1 = y1 * scale1
+   * (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   * (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   * (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   * (11) z1 / scale1 = z0 / scale1 + n
+   * (12) z1 = z0 + n * scale1
+   * }}}
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   * {{{
+   * (13) logscale1 = logscale0 + delta * log(2)
+   * (14) z1 = z0 + n * exp(logscale1)
+   * }}}
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   * {{{
+   * (15) y1 = z1 / scale1
+   * (16) y1 = z1 / exp(logscale1)
+   * (17) y1 = z1 * exp(-logscale1)
+   * }}}
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
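+     *
+     * For example (hypothetical): `val (floor, cap) = cms.range`: even a never-counted key reports at least
+     * `floor`, and no key reports more than `cap`.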
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double = + if (x == 0.0) 0.0 else x * y + + var i = 0 + while (i < depth) { + val left = x.cells(i) + val right = y.cells(i) + var j = 0 + val bldr = rowBuilder() + while (j < width) { + bldr += prod(left(j), xscale) + prod(right(j), yscale) + j += 1 + } + cms.cells(i) = bldr.result() + i += 1 + } + cms + } + + def add(t: Long, k: K, n: Double): CMS = + scaledAdd(fromTimestamp(t), k, n) + + // TODO: we could allocate a mutable scratch pad, write all the + // values into it, and then build a CMS out of it. if items is + // very small, this would be less efficient than what we're doing + // now. probably the "ideal" solution would be determine how many + // items there are. if we have fewer than ~width items, this + // approach is fine. for more, a scratch pad would be better + // (assuming we wrote that code). + // + // alternately, you could map items into (zero + item) and then + // use the monoid's sum to boil it down. + // + // we only use this in testing currently so the current code is + // fine until we rely on it in production. any change here should + // probably include benchmarks justifying the design. + def bulkAdd(items: Iterable[(Long, K, Double)]): CMS = + items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) } + + private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS = + if (n < 0.0) { + val t = toTimestamp(ts1) + throw new IllegalArgumentException( + s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t" + ) + } else if (n == 0.0) { + this + } else { + val logScale1 = nextLogScale(ts1) + if (logScale1 > maxLogScale) { + rescaleTo(ts1).scaledAdd(ts1, k, n) + } else { + val increment = n * Math.exp(logScale1) + val cells1 = allocCells() + var didx = 0 + while (didx < depth) { + val cell = cells(didx) + val w = hashFns(didx)(k) + cells1(didx) = cell.updated(w, cell(w) + increment) + didx += 1 + } + new CMS(cells1, logScale1, ts1) + } + } + + // Set the scale back to 0.0 + // input time is in half-lives + private[algebird] def rescaleTo(ts: Double): CMS = { + val logScale1 = nextLogScale(ts) + val expL = Math.exp(-logScale1) + if (expL == 0.0) { + new CMS(monoid.zero.cells, 0.0, ts) + } else { + val cms = new CMS(allocCells(), 0.0, ts) + var i = 0 + while (i < depth) { + val ci = cells(i) + cms.cells(i) = ci.map(_ * expL) + i += 1 + } + cms + } + } + } + + private def rowBuilder() = { + val bldr = Vector.newBuilder[Double] + bldr.sizeHint(width) + bldr + } + + object CMS { + + implicit val monoidForCMS: Monoid[CMS] = + new Monoid[CMS] { + + def zero: CMS = module.empty + + def plus(x: CMS, y: CMS): CMS = + x + y + + /** + * Turn a flat array into an array of vectors. + */ + private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = { + val cells = new Array[Vector[Double]](depth) + var i = 0 + while (i < depth) { + var j = i * width + val limit = j + width + val bldr = rowBuilder() + while (j < limit) { + bldr += scratch(j) + j += 1 + } + cells(i) = bldr.result() + i += 1 + } + cells + } + + /** + * This method sums the first `num` items in `arr`. 
+ */ + private def innerSum(arr: Array[CMS], num: Int): CMS = + if (num == 0) zero + else if (num == 1) arr(0) + else if (num == 2) plus(arr(0), arr(1)) + else { + // start with zero + val scratch: Array[Double] = new Array(totalCells) + + val latestTimeInHL: Double = + arr.iterator.take(num).map(cms => cms.timeInHL).max + + var i = 0 + while (i < num) { + val cms = arr(i) + val scale = cms.getScale(latestTimeInHL) + var j = 0 + while (j < depth) { + val row = cms.cells(j) + val stride = j * width + var k = 0 + while (k < width) { + val n = row(k) + if (n > 0.0) { + scratch(stride + k) += scale * n + } + k += 1 + } + j += 1 + } + i += 1 + } + + val cells = scratchToCells(scratch) + + new CMS(cells, 0.0, latestTimeInHL) + } + + override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = { + + val it: Iterator[CMS] = xs.toIterator + val ChunkSize = 1000 + + // the idea here is that we read up to 1000 CMS values into + // a fixed array, crunch them down to a single CMS, store it + // in the first array index, read up to 999 more CMS values + // in, crunch them down, and so on. + var i = 0 + val arr = new Array[CMS](ChunkSize) + while (it.hasNext) { + while (it.hasNext && i < ChunkSize) { + arr(i) = it.next() + i += 1 + } + if (i > 1) { + arr(0) = innerSum(arr, i) + } + i = 1 + } + if (i == 0) None else Some(arr(0)) + } + } + } + + val monoid: Monoid[CMS] = CMS.monoidForCMS +} + +object DecayingCMS { + + /** + * Construct a DecayingCMS module. + * + * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will + * always produce the same hash family. + * + * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by + * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to + * zero. + * + * The size of the CMS in bytes is O(depth * width). + * + * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use + * width=100, for 0.1% error, use width=1000, etc. + * + * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha * + * depth) so, a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this + * as small as possible. + */ + def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit + hasher: CMSHasher[K] + ): DecayingCMS[K] = + new DecayingCMS(seed, halfLife, depth, width, hasher) +} diff --git a/algebird-core/src/main/scala-3/Fold.scala b/algebird-core/src/main/scala-3/Fold.scala new file mode 100644 index 000000000..0b89f2d62 --- /dev/null +++ b/algebird-core/src/main/scala-3/Fold.scala @@ -0,0 +1,352 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import java.io.Serializable +import scala.collection.compat._ + +/** + * Folds are first-class representations of "Traversable.foldLeft." 
They have the nice property that they can + * be fused to work in parallel over an input sequence. + * + * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when + * done. We use existential types to hide internal details and to allow for internal and external (X and O) + * types to differ for "map" and "join." + * + * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a + * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the + * fold. + * + * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like + * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also + * expose some internal state so library authors can fold over their own types. + * + * See the companion object for constructors. + */ +sealed trait Fold[-I, +O] extends Serializable { + + /** + * Users can ignore this type. + * + * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good + * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it + * provides. + */ + type X + + /** + * Users can ignore this method. It is exposed so library authors can run folds over their own sequence + * types. + * + * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the + * same Fold many times over different data structures, but we must build a new FoldState every time. + * + * See FoldState for information on how to use this for your own sequence types. + */ + def build(): FoldState[X, I, O] + + /** + * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or + * "Function1.compose." + */ + def map[P](f: O => P): Fold[I, P] = { + val self = this + new Fold[I, P] { + type X = self.X + override def build(): FoldState[X, I, P] = + self.build().map(f) + } + } + + /** + * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time + * and combines at the end. + */ + def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = { + val self = this + new Fold[I2, Q] { + type X = (self.X, other.X) + override def build(): FoldState[X, I2, Q] = { + val first = self.build() + val second = other.build() + new FoldState( + { case ((x, y), i) => (first.add(x, i), second.add(y, i)) }, + (first.start, second.start), + { case (x, y) => f(first.end(x), second.end(y)) } + ) + } + } + } + + /** + * Convenient shorthand for joining Folds without combining at the end. + */ + def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] = + joinWith(other) { case (o, p) => (o, p) } + + /** + * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.") + * This is analogous to "Function1.andThen." + */ + def contramap[H](f: H => I): Fold[H, O] = { + val self = this + new Fold[H, O] { + type X = self.X + override def build(): FoldState[X, H, O] = + self.build().contramap(f) + } + } + + /** + * Trivially runs a Fold over an empty sequence. + */ + def overEmpty: O = { + // build is a "def" so we construct the state once and use the pieces to run the fold + val state = build() + state.end(state.start) + } + + /** + * Trivially runs a Fold over a single element sequence. 
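+   * A minimal sketch (illustrative only):
+   * {{{
+   * Fold.size.overSingleton("x") // 1L
+   * }}}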
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are
+ *   add: (X, I) => X - updates and returns internal state for every input I
+ *   start: X - the initial state
+ *   end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
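+   * For example (illustrative only):
+   * {{{
+   * val firstAndLast = Fold.sequence(Seq(Fold.first[Int], Fold.last[Int]))
+   * firstAndLast.overTraversable(List(1, 2, 3)) // Seq(Some(1), Some(3))
+   * }}}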
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const: def const[A, B](b: B): (A
+   * \=> B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns a min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus.
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
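+   * For example (illustrative only):
+   * {{{
+   * Fold.exists[Int](_ > 2).overTraversable(List(1, 2, 3)) // true
+   * }}}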
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-3/Interval.scala b/algebird-core/src/main/scala-3/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-3/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
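+   * For example (illustrative only):
+   * {{{
+   * Interval.leftClosedRightOpen(0, 10) // MaybeEmpty.NotSoEmpty[Int, InLowExUp]
+   * Interval.leftClosedRightOpen(10, 0) // MaybeEmpty.SoEmpty[Int, InLowExUp]
+   * }}}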
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change.
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that
+     * intr.contains(s).
+     *
+     * If this returns None, it may be Empty, Upper or Universe.
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s).
+     *
+     * If this returns None, it may be Empty, Lower, or Universe.
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
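+   * For example (illustrative): `ExclusiveLower(0).intersects(ExclusiveUpper(1))` is true for Int,
+   * even though no Int lies strictly between 0 and 1.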
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option, because of cases like
+   * ExclusiveLower(Int.MaxValue) which are pathological and equivalent to Empty.
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option, because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p:
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot
+   * be normalized to [a, b) form, thus we return an Option. Also, there are cases like [Int.MinValue,
+   * Int.MaxValue] that cannot be expressed this way but are actually equivalent to Universe. The bottom
+   * line: if this returns None, it just means you can't express the interval this way; it does not mean it
+   * is empty or universe, etc... (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-3/InvariantAlgebras.scala b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?]
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-3/JavaMonoids.scala b/algebird-core/src/main/scala-3/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-3/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.algebird + +import java.lang.{ + Boolean => JBool, + Double => JDouble, + Float => JFloat, + Integer => JInt, + Long => JLong, + Short => JShort +} +import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap} + +import scala.collection.JavaConverters._ + +object JIntRing extends Ring[JInt] { + override val zero: JInt = JInt.valueOf(0) + override val one: JInt = JInt.valueOf(1) + override def plus(x: JInt, y: JInt): JInt = x + y + override def negate(x: JInt): JInt = -x + override def minus(x: JInt, y: JInt): JInt = x - y + override def times(x: JInt, y: JInt): JInt = x * y +} + +object JShortRing extends Ring[JShort] { + override val zero: JShort = Short.box(0) + override val one: JShort = Short.box(1) + override def plus(x: JShort, y: JShort): JShort = (x + y).toShort + override def negate(x: JShort): JShort = (-x).toShort + override def minus(x: JShort, y: JShort): JShort = (x - y).toShort + override def times(x: JShort, y: JShort): JShort = (x * y).toShort +} + +object JLongRing extends Ring[JLong] { + override val zero: JLong = JLong.valueOf(0L) + override val one: JLong = JLong.valueOf(1L) + override def plus(x: JLong, y: JLong): JLong = x + y + override def negate(x: JLong): JLong = -x + override def minus(x: JLong, y: JLong): JLong = x - y + override def times(x: JLong, y: JLong): JLong = x * y +} + +object JFloatRing extends Ring[JFloat] { + override val zero: JFloat = JFloat.valueOf(0.0f) + override val one: JFloat = JFloat.valueOf(1.0f) + override def plus(x: JFloat, y: JFloat): JFloat = x + y + override def negate(x: JFloat): JFloat = -x + override def minus(x: JFloat, y: JFloat): JFloat = x - y + override def times(x: JFloat, y: JFloat): JFloat = x * y +} + +object JDoubleRing extends Ring[JDouble] { + override val zero: JDouble = JDouble.valueOf(0.0) + override val one: JDouble = JDouble.valueOf(1.0) + override def plus(x: JDouble, y: JDouble): JDouble = x + y + override def negate(x: JDouble): JDouble = -x + override def minus(x: JDouble, y: JDouble): JDouble = x - y + override def times(x: JDouble, y: JDouble): JDouble = x * y +} + +object JBoolRing extends Ring[JBool] { + override val zero: JBool = JBool.FALSE + override val one: JBool = JBool.TRUE + override def plus(x: JBool, y: JBool): JBool = + JBool.valueOf(x.booleanValue ^ y.booleanValue) + override def negate(x: JBool): JBool = x + override def minus(x: JBool, y: JBool): JBool = plus(x, y) + override def times(x: JBool, y: JBool): JBool = + JBool.valueOf(x.booleanValue & y.booleanValue) +} + +/** + * Since Lists are mutable, this always makes a full copy. Prefer scala immutable Lists if you use scala + * immutable lists, the tail of the result of plus is always the right argument + */ +class JListMonoid[T] extends Monoid[JList[T]] { + override def isNonZero(x: JList[T]): Boolean = !x.isEmpty + override lazy val zero: JArrayList[T] = new JArrayList[T](0) + override def plus(x: JList[T], y: JList[T]): JArrayList[T] = { + val res = new JArrayList[T](x.size + y.size) + res.addAll(x) + res.addAll(y) + res + } +} + +/** + * Since maps are mutable, this always makes a full copy. Prefer scala immutable maps if you use scala + * immutable maps, this operation is much faster TODO extend this to Group, Ring + */ +class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] { + override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0) + + val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match { + case mon: Monoid[?] 
=> mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: JMap[K, V]): Boolean = + !x.isEmpty && (implicitly[Semigroup[V]] match { + case mon: Monoid[?] => + x.values.asScala.exists(v => mon.isNonZero(v)) + case _ => true + }) + override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + val vsemi = implicitly[Semigroup[V]] + val result = new JHashMap[K, V](big.size + small.size) + result.putAll(big) + small.entrySet.asScala.foreach { kv => + val smallK = kv.getKey + val smallV = kv.getValue + if (big.containsKey(smallK)) { + val bigV = big.get(smallK) + val newV = + if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV) + if (nonZero(newV)) + result.put(smallK, newV) + else + result.remove(smallK) + } else { + // No need to explicitly add with zero on V, just put in the small value + result.put(smallK, smallV) + } + } + result + } +} diff --git a/algebird-core/src/main/scala-3/MapAlgebra.scala b/algebird-core/src/main/scala-3/MapAlgebra.scala new file mode 100644 index 000000000..9ca370eaf --- /dev/null +++ b/algebird-core/src/main/scala-3/MapAlgebra.scala @@ -0,0 +1,320 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import com.twitter.algebird.macros.{Cuber, Roller} +import scala.collection.mutable.{Builder, Map => MMap} +import scala.collection.{Map => ScMap} +import algebra.ring.Rng +import scala.collection.compat._ + +trait MapOperations[K, V, M <: ScMap[K, V]] { + def add(oldMap: M, kv: (K, V)): M + def remove(oldMap: M, k: K): M + def fromMutable(mut: MMap[K, V]): M +} + +abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V]) + extends Monoid[M] + with MapOperations[K, V, M] { + + val nonZero: (V => Boolean) = semigroup match { + case mon: Monoid[?] => mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: M): Boolean = + !x.isEmpty && (semigroup match { + case mon: Monoid[?] => + x.valuesIterator.exists(v => mon.isNonZero(v)) + case _ => true + }) + + override def plus(x: M, y: M): M = { + // Scala maps can reuse internal structure, so don't copy just add into the bigger one: + // This really saves computation when adding lots of small maps into big ones (common) + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small match { + // Mutable maps create new copies of the underlying data on add so don't use the + // handleImmutable method. + // Cannot have a None so 'get' is safe here. + case _: MMap[?, ?] 
=> sumOption(Seq(big, small)).get + case _ => handleImmutable(big, small, bigOnLeft) + } + } + + private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) = + small.foldLeft(big) { (oldMap, kv) => + val newV = big + .get(kv._1) + .map { bigV => + if (bigOnLeft) + semigroup.plus(bigV, kv._2) + else + semigroup.plus(kv._2, bigV) + } + .getOrElse(kv._2) + if (nonZero(newV)) + add(oldMap, kv._1 -> newV) + else + remove(oldMap, kv._1) + } + override def sumOption(items: TraversableOnce[M]): Option[M] = + if (items.iterator.isEmpty) None + else { + val mutable = MMap[K, V]() + items.iterator.foreach { m => + m.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val newV = + if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v) + if (nonZero(newV)) + mutable.update(k, newV) + else + mutable.remove(k) + } + } + Some(fromMutable(mutable)) + } +} + +class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] { + override lazy val zero: Map[K, V] = Map[K, V]() + override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv + override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): Map[K, V] = + new MutableBackedMap(mut) +} + +class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] { + override lazy val zero: ScMap[K, V] = ScMap[K, V]() + override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv + override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): ScMap[K, V] = + new MutableBackedMap(mut) +} + +/** + * You can think of this as a Sparse vector group + */ +class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] { + override def negate(kv: Map[K, V]): Map[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +class ScMapGroup[K, V](implicit val group: Group[V]) + extends ScMapMonoid[K, V]()(group) + with Group[ScMap[K, V]] { + override def negate(kv: ScMap[K, V]): ScMap[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +/** + * You can think of this as a Sparse vector ring + */ +trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] { + + implicit def ring: Ring[V] + + override def times(x: M, y: M): M = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small.foldLeft(zero) { (oldMap, kv) => + val bigV = big.getOrElse(kv._1, ring.zero) + val newV = + if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV) + if (ring.isNonZero(newV)) { + add(oldMap, kv._1 -> newV) + } else { + remove(oldMap, kv._1) + } + } + } +} + +class MapRing[K, V](implicit override val ring: Ring[V]) + extends MapGroup[K, V]()(ring) + with GenericMapRing[K, V, Map[K, V]] + +class ScMapRing[K, V](implicit override val ring: Ring[V]) + extends ScMapGroup[K, V]()(ring) + with GenericMapRing[K, V, ScMap[K, V]] + +object MapAlgebra { + def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean = + l.forall { case (k, v) => + r.get(k).exists(Equiv[V].equiv(_, v)) + } + + implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] = + Equiv.fromFunction { (m1, m2) => + val cleanM1 = removeZeros(m1) + val cleanM2 = removeZeros(m2) + rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, 
cleanM1) + } + + def mergeLookup[T, U, V: Monoid]( + keys: TraversableOnce[T] + )(lookup: T => Option[V])(present: T => U): Map[U, V] = + sumByKey { + keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V])) + } + + // Returns a new map with zero-value entries removed + def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] = + m.filter { case (_, v) => Monoid.isNonZero(v) } + + /** + * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from + * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, This function is + * equivalent to: + * + * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum) + * + * Otherwise, the function is equivalent to: + * + * pairs.groupBy(_._1).mapValues(_.map(_._2).sum) + */ + def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] = + Monoid.sum(pairs.iterator.map(Map(_))) + + /** + * For each key, creates a list of all values. This function is equivalent to: + * + * pairs.groupBy(_._1).mapValues(_.map(_._2)) + */ + def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] = + if (pairs.iterator.isEmpty) Map.empty + else { + val mutable = MMap[K, Builder[V, List[V]]]() + pairs.iterator.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val bldr = if (oldVOpt.isEmpty) { + val b = List.newBuilder[V] + mutable.update(k, b) + b + } else oldVOpt.get + bldr += v + } + mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap + } + + // Consider this as edges from k -> v, produce a Map[K,Set[V]] + def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] = + Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) }) + + /** join the keys of two maps (similar to outer-join in a DB) */ + def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] = + Monoid + .plus( + map1.transform { case (_, v) => + (List(v), List[W]()) + }, + map2.transform { case (_, w) => + (List[V](), List(w)) + } + ) + .transform { case (_, (v, w)) => (v.headOption, w.headOption) } + + /** + * Reverses a graph losslessly None key is for v's with no sources. 
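+   * For example (illustrative only):
+   * {{{
+   * invertExact(Map(Some(1) -> Set("a", "b")))
+   *   // Map(Some("a") -> Set(1), Some("b") -> Set(1))
+   * invertExact(Map(Some(2) -> Set.empty[String]))
+   *   // Map(None -> Set(2))
+   * }}}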
+ */ + def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = { + def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] = + if (i.isEmpty) Iterable(None) + else { + i.map(Some(_)) + } + + Monoid.sum { + for { + (k, sv) <- m.view.toIterable + v <- nonEmptyIter(sv) + } yield Map(v -> k.toSet) + } + } + + /** + * Invert the Common case of exactly one value for each key + */ + def invert[K, V](m: Map[K, V]): Map[V, Set[K]] = + Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) }) + + def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V = + Monoid.sum(mring.times(left, right).values) + + def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = { + val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]() + it.iterator.foreach { case (k, v) => + c(k).iterator.foreach { ik => + map.get(ik) match { + case Some(vs) => map += ik -> (v :: vs) + case None => map += ik -> List(v) + } + } + } + map.foreach { case (k, v) => map(k) = v.reverse } + new MutableBackedMap(map) + } + + def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] = + sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) }) + + def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])( + fn: T => K + )(implicit c: Cuber[K]): Map[c.K, V] = + sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup) + .map { case (k, v) => (k, agg.present(v)) } + + def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = { + val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]() + it.iterator.foreach { case (k, v) => + r(k).iterator.foreach { ik => + map.get(ik) match { + case Some(vs) => map += ik -> (v :: vs) + case None => map += ik -> List(v) + } + } + } + map.foreach { case (k, v) => map(k) = v.reverse } + new MutableBackedMap(map) + } + + def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] = + sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) }) + + def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])( + fn: T => K + )(implicit r: Roller[K]): Map[r.K, V] = + sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup) + .map { case (k, v) => (k, agg.present(v)) } + +} diff --git a/algebird-core/src/main/scala-3/Scan.scala b/algebird-core/src/main/scala-3/Scan.scala new file mode 100644 index 000000000..2dc2ff9c2 --- /dev/null +++ b/algebird-core/src/main/scala-3/Scan.scala @@ -0,0 +1,333 @@ +package com.twitter.algebird + +import scala.collection.compat._ + +object Scan { + + /** + * Most consumers of Scan don't care about the type of the type State type variable. But for those that do, + * we make an effort to expose it in all of our combinators. 
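+   * For example (illustrative), `Scan.Aux[Int, Long, Long]` denotes a `Scan[Int, Long]` whose internal
+   * `State` is known to be `Long`.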
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce
+   * a stream of outputs. Streams can be thought of as being a hidden state that is queryable for a head
+   * element, and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head-element and tail-stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan.
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where
+   *   `c_i = initState + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final
+      // summation; this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where
+   *   `c_i = monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality
+ * as `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that
+ * I'm scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an
+ * iterator with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair
+ * within the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is
+ * done, then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+   *   result of updating stateBeforeProcessing with the information from i.
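+   * For example, a running-sum scan could satisfy this contract as follows (an illustrative sketch, not
+   * part of the original source):
+   * {{{
+   * val runningSum = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * runningSum(List(1, 2, 3)) // List(1, 3, 6)
+   * }}}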
+ */ + def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State) + + /** + * @param iter + * @return + * If `iter = Iterator(a_1, ..., a_n)`, return:` `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) = + * presentAndNextState(a_i, state_i)` and `state_0 = initialState` + */ + def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] { + override def hasNext: Boolean = iter.hasNext + var state: State = initialState + override def next(): O = { + val thisState = state + val thisA = iter.next() + val (thisC, nextState) = presentAndNextState(thisA, thisState) + state = nextState + thisC + } + } + + /** + * @param inputs + * @param bf + * @tparam In + * The type of the input collection + * @tparam Out + * The type of the output collection + * @return + * Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form: + * `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 = + * initialState`. + */ + def apply[In <: TraversableOnce[I], Out]( + inputs: In + )(implicit bf: BuildFrom[In, O, Out]): Out = + bf.fromSpecific(inputs)(scanIterator(inputs.toIterator)) + + // combinators + + /** + * Return a new scan that is the same as this scan, but with a different `initialState`. + * @param newInitialState + * @return + */ + def replaceState(newInitialState: => State): Aux[I, State, O] = + from(newInitialState)(presentAndNextState(_, _)) + + def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) => + presentAndNextState(f(i), stateBeforeProcessingI) + } + + def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) => + val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI) + (g(c), stateAfterProcessingA) + } + + /** + * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't + * pollute the `State` by pairing it redundantly with `Unit`. + * @tparam I1 + * @return + * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1, + * ..., o_n`, then this results in a Scan whose `apply` method returns `[(o_1, a_1), ..., (o_n, a_n)]` + * when given the same input. + */ + def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) => + val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI) + ((o, i), stateAfterProcessingI) + } + + /** + * Return a scan whose output is paired with the state of the scan before each input updates the state. + * @return + * If this Scan's `apply` method is given inputs [a_1, ..., a_n] resulting in outputs of the form `[o_1, + * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 = + * initialState`, return a scan that whose apply method, when given inputs `[a_1, ..., a_n]` will return + * `[(o_1, state_0), ..., (o_n, state_(n-1))]`. + */ + def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) => + val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI) + ((stateBeforeProcessingI, o), stateAfterProcessingA) + } + + /** + * Return a scan whose output is paired with the state of the scan after each input updates the state. 
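+   * For the running-sum sketch above, this would emit `(sum, sum)` at each step, since there the output
+   * and the posterior state coincide.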
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and
+   *   `state_0 = initialState`, return a scan whose `apply` method, when given inputs `[a_1, ..., a_n]`,
+   *   will return `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose `apply` method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan
+   *   whose `apply` method, when given input `[(a_1, b_1), ..., (a_n, b_n)]`, results in the output
+   *   `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) ==
+   *   scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan
+   *   whose `apply` method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a
+   *   scan which returns `[p_1, ..., p_n]`.
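+   *
+   * For instance (hypothetical, reusing the `runningSum` sketch): composing the running-sum scan with itself
+   * produces running sums of the prefix sums:
+   * {{{
+   * // runningSum.compose(runningSum)(List(1L, 1L, 1L)) == List(1L, 3L, 6L)
+   * }}}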
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-3/SpaceSaver.scala b/algebird-core/src/main/scala-3/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-3/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public api to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing in order
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the items/counters (encoded as: the number of counters, then, per counter, the item size,
+   *     the item bytes and the 2 counter values)
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+    Try {
+      bb.get.toInt match {
+        case 1 =>
+          val capacity = bb.getInt
+          val itemLength = bb.getInt
+          val itemAsBytes = new Array[Byte](itemLength)
+          bb.get(itemAsBytes)
+          tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+        case 2 =>
+          val capacity = bb.getInt
+
+          var countersToDeserialize = bb.getInt
+          val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+          while (countersToDeserialize != 0) {
+            val itemLength = bb.getInt()
+            val itemAsBytes = new Array[Byte](itemLength)
+            bb.get(itemAsBytes)
+            val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+            val a = bb.getLong
+            val b = bb.getLong
+
+            item match {
+              case Failure(e) => return Failure(e)
+              case Success(i) =>
+                counters += ((i, (a, b)))
+            }
+
+            countersToDeserialize -= 1
+          }
+
+          Success(SSMany(capacity, counters.toMap))
+      }
+    }.flatten
+}
+
+/**
+ * Data structure used in the Space-Saving algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * to preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+  import SpaceSaver.ordering
+
+  /**
+   * Maximum number of counters to keep (parameter "m" in the research paper).
+   */
+  def capacity: Int
+
+  /**
+   * Current lowest value for count
+   */
+  def min: Long
+
+  /**
+   * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+   * (error)
+   */
+  def counters: Map[T, (Long, Long)]
+
+  def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+  /**
+   * Returns the frequency estimate for the item.
+   */
+  def frequency(item: T): Approximate[Long] = {
+    val (count, err) = counters.getOrElse(item, (min, min))
+    Approximate(count - err, count, count, 1.0)
+  }
+
+  /**
+   * Get the elements that show up at least `thres` times. Returns sorted in descending order: (item,
+   * Approximate[Long], guaranteed)
+   */
+  def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+    counters.iterator
+      .filter { case (_, (count, _)) => count >= thres }
+      .toList
+      .sorted(ordering)
+      .map { case (item, (count, err)) =>
+        (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+      }
+
+  /**
+   * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+   */
+  def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+    require(k < capacity)
+    val si = counters.toList
+      .sorted(ordering)
+    val siK = si.take(k)
+    val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+    siK.map { case (item, (count, err)) =>
+      (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+    }
+  }
+
+  /**
+   * Check consistency with another SpaceSaver, useful for testing. Returns a boolean indicating whether they
+   * are consistent.
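+   *
+   * For example (a hypothetical sketch): two sketches summarizing the same multiset of items, built in
+   * different ways, are expected to be consistent:
+   * {{{
+   * val a = SpaceSaver(10, "x") ++ SpaceSaver(10, "y") ++ SpaceSaver(10, "x")
+   * val b = SpaceSaver(10, "x", 2L) ++ SpaceSaver(10, "y", 1L)
+   * a.consistentWith(b) // expected: true
+   * }}}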
+   */
+  def consistentWith(that: SpaceSaver[T]): Boolean =
+    (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0)
+}
+
+case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] {
+  require(capacity > 1)
+
+  override def min: Long = 0L
+
+  override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L)))
+
+  override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+    case other: SSOne[?]  => SSMany(this).add(other)
+    case other: SSMany[?] => other.add(this)
+  }
+}
+
+object SSMany {
+  private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] =
+    SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap
+
+  private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] =
+    SSMany(capacity, counters, bucketsFromCounters(counters))
+
+  private[algebird] def apply[T](one: SSOne[T]): SSMany[T] =
+    SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item)))
+}
+
+case class SSMany[T] private (
+    override val capacity: Int,
+    override val counters: Map[T, (Long, Long)],
+    buckets: SortedMap[Long, Set[T]]
+) extends SpaceSaver[T] {
+  private val exact: Boolean = counters.size < capacity
+
+  override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey
+
+  // item is already present and just needs to be bumped up one
+  private def bump(item: T) = {
+    val (count, err) = counters(item)
+    val counters1 = counters + (item -> ((count + 1L, err))) // increment by one
+    val currBucket = buckets(count) // current bucket
+    val buckets1 = {
+      if (currBucket.size == 1) // delete current bucket since it will be empty
+        buckets - count
+      else // remove item from current bucket
+        buckets + (count -> (currBucket - item))
+    } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // lose one item to meet capacity constraint
+  private def loseOne = {
+    val firstBucket = buckets(buckets.firstKey)
+    val itemToLose = firstBucket.head
+    val counters1 = counters - itemToLose
+    val buckets1 =
+      if (firstBucket.size == 1)
+        buckets - min
+      else
+        buckets + (min -> (firstBucket - itemToLose))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // introduce new item
+  private def introduce(item: T, count: Long, err: Long) = {
+    val counters1 = counters + (item -> ((count, err)))
+    val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // add a single element
+  private[algebird] def add(x: SSOne[T]): SSMany[T] = {
+    require(x.capacity == capacity)
+    if (counters.contains(x.item))
+      bump(x.item)
+    else
+      (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min)
+  }
+
+  // merge two stream summaries
+  private def merge(x: SSMany[T]): SSMany[T] = {
+    require(x.capacity == capacity)
+    val counters1 = Map() ++
+      (counters.keySet ++ x.counters.keySet).toList
+        .map { key =>
+          val (count1, err1) = counters.getOrElse(key, (min, min))
+          val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min))
+          key -> ((count1 + count2, err1 + err2))
+        }
+        .sorted(SpaceSaver.ordering)
+        .take(capacity)
+    SSMany(capacity, counters1)
+  }
+
+  override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+    case other: SSOne[?]  => add(other)
+    case other: SSMany[?] => merge(other)
+  }
+}
+
+class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] {
+  override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y
+}
diff --git a/algebird-core/src/main/scala-3/VectorSpace.scala b/algebird-core/src/main/scala-3/VectorSpace.scala
new file mode 100644
index 000000000..f8818600c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/VectorSpace.scala
@@ -0,0 +1,59 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import scala.annotation.implicitNotFound
+
+/**
+ * This class represents a vector space. For the required properties see:
+ *
+ * http://en.wikipedia.org/wiki/Vector_space#Definition
+ */
+object VectorSpace extends VectorSpaceOps with Implicits
+
+sealed trait VectorSpaceOps {
+  def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] =
+    vs.scale(v, c)
+  def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] =
+    new VectorSpace[F, C] {
+      override def ring: Ring[F] = r
+      override def group: Group[C[F]] = cGroup
+      override def scale(v: F, c: C[F]): C[F] =
+        if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero
+    }
+}
+private object VectorSpaceOps extends VectorSpaceOps
+
+sealed trait Implicits extends LowPrioImplicits {
+  implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] =
+    VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _)))
+}
+
+sealed trait LowPrioImplicits {
+  implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] =
+    VectorSpaceOps.from[T, Map[K, _]] { (s, m) =>
+      m.transform { case (_, v) => Ring.times(s, v) }
+    }
+}
+
+@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}")
+trait VectorSpace[F, C[_]] extends java.io.Serializable {
+  implicit def ring: Ring[F]
+  def field: Ring[F] = ring // this is for compatibility with older versions
+  implicit def group: Group[C[F]]
+  def scale(v: F, c: C[F]): C[F]
+}
diff --git a/algebird-core/src/main/scala-3/monad/EitherMonad.scala b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
new file mode 100644
index 000000000..b6d5e2ffc
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// Monad for Either, used for modeling computations that can fail, where L is the type of the error
+object EitherMonad {
+  class Error[L] extends Monad[Either[L, _]] {
+    override def apply[R](r: R): Right[L, R] = Right(r)
+
+    override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] =
+      self.right.flatMap(next)
+
+    override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] =
+      self.right.map(fn)
+  }
+
+  implicit def monad[L]: Monad[Either[L, _]] = new Error[L]
+
+  def assert[L](truth: Boolean, failure: => L): Either[L, Unit] =
+    if (truth) Right(()) else Left(failure)
+}
diff --git a/algebird-core/src/main/scala-3/monad/Reader.scala b/algebird-core/src/main/scala-3/monad/Reader.scala
new file mode 100644
index 000000000..e0747af20
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/Reader.scala
@@ -0,0 +1,76 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// TODO this is general, move somewhere better
+
+// Reader monad: represents a series of operations that read from a shared environment
+// type (the input to the function)
+
+sealed trait Reader[-Env, +T] {
+  def apply(env: Env): T
+  def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] =
+    FlatMappedReader[E1, T, U](this, next)
+  def map[U](thatFn: T => U): Reader[Env, U] =
+    FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t)))
+}
+
+final case class ConstantReader[+T](get: T) extends Reader[Any, T] {
+  override def apply(env: Any): T = get
+  override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get))
+  override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] =
+    next(get)
+}
+final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] {
+  override def apply(env: E): T = fn(env)
+}
+final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] {
+  override def apply(env: E): T = {
+    @annotation.tailrec
+    def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any =
+      r match {
+        case ConstantReader(get) =>
+          stack match {
+            case head :: tail => loop(head(get), tail)
+            case Nil => get
+          }
+        case ReaderFn(fn) =>
+          stack match {
+            case head :: tail => loop(head(fn(env)), tail)
+            case Nil => fn(env)
+          }
+        case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack)
+      }
+    loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T]
+  }
+}
+
+object Reader {
+  def const[T](t: T): Reader[Any, T] = ConstantReader(t)
+  implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn)
+
+  class ReaderM[Env] extends Monad[Reader[Env, _]] {
+    override def apply[T](t: T): ConstantReader[T] = ConstantReader(t)
+    override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] =
+      self.flatMap(next)
+    override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] =
+      self.map(fn)
+  }
+
+  implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env]
+}
diff --git a/algebird-core/src/main/scala-3/monad/StateWithError.scala b/algebird-core/src/main/scala-3/monad/StateWithError.scala
new file mode 100644
index 000000000..e15a9ebc3
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/StateWithError.scala
@@ -0,0 +1,130 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.{Monad, Semigroup}
+
+/**
+ * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase
+ * with existing mutable APIs (like Storm or Cascading), but retain the ability to compose carefully.
+ */
+sealed trait StateWithError[S, +F, +T] {
+  def join[F1 >: F, U](
+      that: StateWithError[S, F1, U],
+      mergeErr: (F1, F1) => F1,
+      mergeState: (S, S) => S
+  ): StateWithError[S, F1, (T, U)] =
+    join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState))
+
+  def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit
+      sgf: Semigroup[F1],
+      sgs: Semigroup[S]
+  ): // TODO: deep joins could blow the stack, not yet using trampoline here
+  StateWithError[S, F1, (T, U)] =
+    StateFn { (requested: S) =>
+      (run(requested), that.run(requested)) match {
+        case (Right((s1, r1)), Right((s2, r2))) =>
+          Right((sgs.plus(s1, s2), (r1, r2)))
+        case (Left(err1), Left(err2)) =>
+          Left(sgf.plus(err1, err2)) // combine the errors when both sides fail
+        case (Left(err), _) => Left(err)
+        case (_, Left(err)) => Left(err)
+      }
+    }
+
+  def apply(state: S): Either[F, (S, T)] = run(state)
+
+  def run(state: S): Either[F, (S, T)]
+
+  def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] =
+    FlatMappedState(this, next)
+
+  def map[U](fn: (T) => U): StateWithError[S, F, U] =
+    FlatMappedState(this, (t: T) => StateWithError.const(fn(t)))
+}
+
+/** Simple wrapper of a function in the Monad */
+final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] {
+  override def run(state: S): Either[F, (S, T)] = fn(state)
+}
+
+/**
+ * A trampolining instance that should prevent stack overflow at the expense of performance
+ */
+final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U])
+    extends StateWithError[S, F, U] {
+  override def run(state: S): Either[F, (S, U)] = {
+    @annotation.tailrec
+    def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any =
+      st match {
+        case StateFn(fn) =>
+          fn(inState) match {
+            case err @ Left(_) => err // bail at first error
+            case noError @ Right((newState, out)) =>
+              stack match {
+                case head :: tailStack => loop(newState, head(out), tailStack)
+                case Nil => noError // recursion ends
+              }
+          }
+        case FlatMappedState(st, next) => loop(inState, st, next :: stack)
+      }
+    loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]]
+  }
+}
+
+object StateWithError {
+  def getState[S]: StateWithError[S, Nothing, S] =
StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala index 29329b788..53a0eff17 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala @@ -87,9 +87,9 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3. summingCache = new SummingWithHitsCache(currentCapacity) if (currentCapacity == maxCapacity) - sentinelCache.stopGrowing + sentinelCache.stopGrowing() else - sentinelCache.clear + sentinelCache.clear() } ret } @@ -101,7 +101,7 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3. 
override def flush: Option[Map[K, V]] = { val ret = summingCache.flush - sentinelCache.clear + sentinelCache.clear() ret } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala index e47fb8792..31f5117bc 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala @@ -145,7 +145,7 @@ object AdaptiveVector { def iteq: Boolean = (lit.hasNext, rit.hasNext) match { case (true, true) => - val (lnext, rnext) = (lit.next, rit.next) + val (lnext, rnext) = (lit.next(), rit.next()) if (lnext._1 == rnext._1 && Equiv[V].equiv(lnext._2, rnext._2)) iteq else diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala index 32a66339a..211cac612 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala @@ -42,7 +42,7 @@ trait Applicative[M[_]] extends Functor[M] { case _ => val mb = ms.foldLeft(apply(Seq.newBuilder[T]))((mb, mt) => joinWith(mb, mt)((b, t) => b += t)) - map(mb)(_.result) + map(mb)(_.result()) } def joinWith[T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V): M[V] = map(join(mt, mu)) { case (t, u) => fn(t, u) } @@ -102,7 +102,7 @@ object Applicative { )(implicit app: Applicative[M], cbf: Factory[T, R[T]]): M[R[T]] = { val bldr = cbf.newBuilder val mbldr = ms.iterator.foldLeft(app.apply(bldr))((mb, mt) => app.joinWith(mb, mt)(_ += _)) - app.map(mbldr)(_.result) + app.map(mbldr)(_.result()) } def joinWith[M[_], T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V)(implicit app: Applicative[M]): M[V] = diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala index 9d684db79..efef198e3 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala @@ -112,7 +112,7 @@ object AveragedValue { */ def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, AveragedValue, Double] = Aggregator - .prepareMonoid { n: N => AveragedValue(num.toDouble(n)) } + .prepareMonoid { (n: N) => AveragedValue(num.toDouble(n)) } .andThenPresent(_.value) /** diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala index d209a98dc..0db108a3a 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala @@ -104,7 +104,7 @@ object Batched { if (ts.iterator.isEmpty) None else { val it = ts.iterator - val t0 = it.next + val t0 = it.next() Some(Item(t0).append(it)) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala index bda97981d..5ea0f11d5 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala @@ -33,7 +33,7 @@ object RichCBitSet { def fromBitSet(bs: BitSet): CBitSet = { val nbs = new CBitSet val it = bs.iterator - while (it.hasNext) { nbs.set(it.next) } + while (it.hasNext) { nbs.set(it.next()) } nbs } implicit def cb2rcb(cb: CBitSet): RichCBitSet = new RichCBitSet(cb) @@ -235,7 
+235,7 @@ case class BloomFilterMonoid[A](numHashes: Int, width: Int)(implicit hash: Hash1 case BFInstance(_, bitset, _) => // these Ints are boxed so, that's a minor bummer val iter = bitset.iterator - while (iter.hasNext) { set(iter.next) } + while (iter.hasNext) { set(iter.next()) } } if (sets == 0) Some(zero) else if (sets == numHashes && (oneItem != null)) Some(oneItem) @@ -307,7 +307,7 @@ object BF { new IntIterator { val boxedIter: Iterator[Int] = bitset.iterator override def hasNext: Boolean = boxedIter.hasNext - override def next: Int = boxedIter.next + override def next: Int = boxedIter.next() } case BFZero(_, _) => new IntIterator { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala index e8c45b668..102f2e3c7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala @@ -45,7 +45,7 @@ abstract class ArrayBufferedOperation[I, O](size: Int) extends Buffered[I, O] { if (buffer.isEmpty) None else { val res = operate(buffer.toSeq) - buffer.clear + buffer.clear() Some(res) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala index 2f6d6e988..3a01eee07 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala @@ -105,7 +105,7 @@ case class ExpHist( b += bucket }, _ => Vector.newBuilder[Bucket], - x => addAll(x.result) + x => addAll(x.result()) ) // This internal method assumes that the instance is stepped forward @@ -182,7 +182,7 @@ object ExpHist { case class Bucket(size: Long, timestamp: Timestamp) object Bucket { - implicit val ord: Ordering[Bucket] = Ordering.by { b: Bucket => (b.timestamp, b.size) } + implicit val ord: Ordering[Bucket] = Ordering.by { (b: Bucket) => (b.timestamp, b.size) } } /** @@ -260,7 +260,7 @@ object ExpHist { if (desired.isEmpty) Vector.empty else { val input = buckets.dropWhile(_.size == 0) - val bucketSize +: tail = desired + val bucketSize +: tail = desired : @unchecked val remaining = drop(bucketSize, input) input.head.copy(size = bucketSize) +: rebucket(remaining, tail) } @@ -275,7 +275,7 @@ object ExpHist { * If an element wasn't fully consumed, the remainder will be stuck back onto the head. 
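   *
   * For example (hypothetical values): dropping 3 from buckets of sizes [2, 2] consumes the first bucket
   * entirely and leaves a remainder of size 1 in the second, so that
   * {{{
   * drop(3, Vector(Bucket(2, t1), Bucket(2, t2))) == Vector(Bucket(1, t2))
   * }}}
   * where `t1` and `t2` are arbitrary timestamps.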
*/ @tailrec private[this] def drop(toDrop: Long, input: Vector[Bucket]): Vector[Bucket] = { - val (b @ Bucket(count, _)) +: tail = input + val (b @ Bucket(count, _)) +: tail = input : @unchecked (toDrop - count) match { case 0 => tail case x if x < 0 => b.copy(size = -x) +: tail diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala index 0d86aa03e..03b1dad0c 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala @@ -27,7 +27,7 @@ class HashingTrickMonoid[V: Group](bits: Int, seed: Int = 123456) extends Monoid Monoid.plus(left, right) def init[K](kv: (K, V))(implicit ev: K => Array[Byte]): AdaptiveVector[V] = { - val (long1, long2) = hash(kv._1) + val (long1, long2):(Long,Long) = hash(kv._1) val index = (long1 & bitMask).toInt val isNegative = (long2 & 1) == 1 diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala index adac1141d..0fc0b97e6 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala @@ -419,7 +419,7 @@ case class SparseHLL(override val bits: Int, maxRhow: Map[Int, Max[Byte]]) exten val iter: Iterator[(Int, Max[Byte])] = maxRhow.iterator while (iter.hasNext) { - val (idx, _) = iter.next + val (idx, _) = iter.next() val existing: Byte = newContents(idx) val other: Byte = maxRhow(idx).get @@ -575,7 +575,7 @@ class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] with BoundedSemilatti None } else { val iter = items.iterator.buffered - var curValue = iter.next + var curValue = iter.next() while (iter.hasNext) { curValue = (curValue, iter.head) match { case (DenseHLL(_, _), _) => denseUpdate(curValue, iter) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala index f795b1a4c..75b5c7ccc 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala @@ -62,7 +62,7 @@ case class HLLSeries(bits: Int, rows: Vector[Map[Int, Long]]) { while (i >= 0) { val it = rows(i).iterator while (it.hasNext) { - val (k, t) = it.next + val (k, t) = it.next() if (t >= threshold && seen.add(k)) { sum += HyperLogLog.negativePowersOfTwo(i + 1) } @@ -142,7 +142,7 @@ class HyperLogLogSeriesMonoid(val bits: Int) extends Monoid[HLLSeries] { val bldr = Vector.newBuilder[Map[Int, Long]] val lit = left.rows.iterator val rit = right.rows.iterator - while (lit.hasNext && rit.hasNext) bldr += combine(lit.next, rit.next) + while (lit.hasNext && rit.hasNext) bldr += combine(lit.next(), rit.next()) val zipped = bldr.result() HLLSeries(bits, zipped ++ right.rows.slice(ln, rn)) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala index df95c4691..6e84c7541 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala @@ -160,8 +160,8 @@ private[algebird] sealed abstract class LowPriorityMaxInstances { while (true) { if (xs.hasNext) { if (ys.hasNext) { - val x = xs.next - val y = ys.next + val x = xs.next() + val y = ys.next() val cmp = ord.compare(x, y) if (cmp 
!= 0) return cmp } else { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala index e5c6df39b..fc4dd10e8 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala @@ -73,7 +73,7 @@ object Metric { def minkowskiMap[K, V: Monoid: Metric](p: Double): Metric[Map[K, V]] = Metric.from { (a: Map[K, V], b: Map[K, V]) => - val outP = (a.keySet ++ b.keySet).map { key: K => + val outP = (a.keySet ++ b.keySet).map { (key: K) => val v1 = a.getOrElse(key, Monoid.zero[V]) val v2 = b.getOrElse(key, Monoid.zero[V]) math.pow(implicitly[Metric[V]].apply(v1, v2), p) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala index ada06450b..5c6b9ebc9 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala @@ -69,7 +69,7 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N private val hashFunctions = { val r = new scala.util.Random(seed) val numHashFunctions = math.ceil(numBytes / 16.0).toInt - (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong)) + (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong())) } /** Signature for empty set, needed to be a proper Monoid */ diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala index 74eb5a428..9da380b3e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala @@ -248,7 +248,7 @@ object Moments { val fold: Fold[Double, Moments] = momentsMonoid.zero.fold def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, Moments, Moments] = - Aggregator.prepareMonoid { n: N => Moments(num.toDouble(n)) } + Aggregator.prepareMonoid { (n: N) => Moments(num.toDouble(n)) } /** * Create a Moments object given a single value. 
This is useful for initializing moment calculations at the diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala index de8c31a71..cd14c7a96 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala @@ -57,7 +57,7 @@ object Monad { if (xs.isEmpty) monad.apply(acc) else - monad.flatMap(fn(acc, xs.head)) { t: T => foldM(t, xs.tail)(fn) } + monad.flatMap(fn(acc, xs.head)) { (t: T) => foldM(t, xs.tail)(fn) } // Some instances of the Monad typeclass (case for a macro): implicit val list: Monad[List] = new Monad[List] { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala index a10d6d8a8..1d81a888e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala @@ -187,10 +187,10 @@ trait FlatMapPreparer[A, T] extends Preparer[A, T] { def prepareFn: A => TraversableOnce[T] def map[U](fn: T => U): FlatMapPreparer[A, U] = - FlatMapPreparer { a: A => prepareFn(a).map(fn) } + FlatMapPreparer { (a: A) => prepareFn(a).map(fn) } override def flatMap[U](fn: T => TraversableOnce[U]): FlatMapPreparer[A, U] = - FlatMapPreparer { a: A => prepareFn(a).flatMap(fn) } + FlatMapPreparer { (a: A) => prepareFn(a).flatMap(fn) } override def monoidAggregate[B, C](aggregator: MonoidAggregator[T, B, C]): MonoidAggregator[A, B, C] = aggregator.sumBefore.composePrepare(prepareFn) @@ -242,10 +242,10 @@ object FlatMapPreparer { override val prepareFn: TraversableOnce[A] => TraversableOnce[A] = (a: TraversableOnce[A]) => a override def map[U](fn: A => U): FlatMapPreparer[TraversableOnce[A], U] = - FlatMapPreparer { a: TraversableOnce[A] => a.map(fn) } + FlatMapPreparer { (a: TraversableOnce[A]) => a.map(fn) } override def flatMap[U](fn: A => TraversableOnce[U]): FlatMapPreparer[TraversableOnce[A], U] = - FlatMapPreparer { a: TraversableOnce[A] => a.flatMap(fn) } + FlatMapPreparer { (a: TraversableOnce[A]) => a.flatMap(fn) } override def monoidAggregate[B, C]( aggregator: MonoidAggregator[A, B, C] diff --git a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala index 2376cfbf8..c78897715 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala @@ -151,9 +151,9 @@ class QTreeSemigroup[A](k: Int)(implicit val underlyingMonoid: Monoid[A]) extend val batchSize = compressBatchSize var count = 1 // start at 1, so we only compress after batchSize items val iter = items.toIterator - var result = iter.next // due to not being empty, this does not throw + var result = iter.next() // due to not being empty, this does not throw while (iter.hasNext) { - result = result.merge(iter.next) + result = result.merge(iter.next()) count += 1 if (count % batchSize == 0) { result = result.compress(k) @@ -428,8 +428,8 @@ class QTree[@specialized(Int, Long, Float, Double) A] private[algebird] ( print(" (" + parentCount + ")") } println(" {" + _sum + "}") - lowerChild.foreach(_.dump) - upperChild.foreach(_.dump) + lowerChild.foreach(_.dump()) + upperChild.foreach(_.dump()) } /** diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala index f5973c338..e327ed57c 
100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala @@ -145,7 +145,7 @@ case class SketchMapParams[K](seed: Int, width: Int, depth: Int, heavyHittersCou val numCounters = width (0 to (numHashes - 1)).map { _ => val smhash: SketchMapHash[K] = - SketchMapHash(CMSHash[Long](r.nextInt, 0, numCounters), seed)(serialization) + SketchMapHash(CMSHash[Long](r.nextInt(), 0, numCounters), seed)(serialization) (k: K) => smhash(k) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala index 4cd9a1505..e2302e899 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala @@ -57,7 +57,7 @@ class SummingCache[K, V](capacity: Int)(implicit sgv: Semigroup[V]) extends Stat override def flush: Option[Map[K, V]] = { // Get a copy of the cache, since it is mutable val res = optNonEmpty(cache.iterator.toMap) - cache.clear + cache.clear() res } override def isFlushed: Boolean = cache.isEmpty diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala index cd9e7deaf..7644aca2e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala @@ -49,16 +49,16 @@ class SummingIterator[V](summer: StatefulSummer[V], it: Iterator[V]) // This has to be lazy because it shouldn't be touched until the val it is exhausted protected lazy val tailIter: Iterator[V] = summer.flush.iterator override def hasNext: Boolean = it.hasNext || tailIter.hasNext - override def next: V = nextInternal + override def next(): V = nextInternal @tailrec private def nextInternal: V = if (it.hasNext) { - summer.put(it.next) match { + summer.put(it.next()) match { case None => nextInternal case Some(v) => v } } else { - tailIter.next + tailIter.next() } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala index 8df431d7e..199553780 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala @@ -126,7 +126,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] { val it = ws.toIterator var queue = Queue.empty[T] while (it.hasNext) { - queue = (queue ++ it.next.items).takeRight(windowSize) + queue = (queue ++ it.next().items).takeRight(windowSize) } Some(Window(monoid.sum(queue), queue)) } @@ -140,7 +140,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] { while (it.hasNext) { // avoid materializing the whole list in memory // at one time - queue = queue :+ it.next + queue = queue :+ it.next() size = size + 1 if (size > windowSize) { queue = queue.tail diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala index d58a6c9ab..3e90cadcf 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala @@ -573,7 +573,7 @@ object BitSet { BitSet.adoptedUnion(this, rhs) } else { // height == rhs.height, so we know rhs is a Branch. 
- val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked val cs = new Array[BitSet](32) var i = 0 while (i < 32) { @@ -605,7 +605,7 @@ object BitSet { Empty } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs: @unchecked val cs = new Array[BitSet](32) var i = 0 var nonEmpty = false @@ -643,7 +643,7 @@ object BitSet { false } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked var i = 0 while (i < 32) { val x = children(i) @@ -688,7 +688,7 @@ object BitSet { this | rhs } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked val cs = new Array[BitSet](32) var i = 0 while (i < 32) { @@ -805,7 +805,7 @@ object BitSet { throw InternalError("branch misaligned") } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs: @unchecked var i = 0 while (i < 32) { val x = children(i) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala index 71a861075..572dce367 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala @@ -272,7 +272,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H override def +(other: A): Hash = { val bs = BitSet.newEmpty(0) - val hash = new Array[Int](numHashes) + val hash = new Array[Int](this.numHashes) hashToArray(item, hash) bs.mutableAdd(hash) @@ -336,7 +336,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H // use an approximation width of 0.05 override def size: Approximate[Long] = - BloomFilter.sizeEstimate(numBits, numHashes, width, 0.05) + BloomFilter.sizeEstimate(this.numBits, numHashes, width, 0.05) } implicit val monoid: Monoid[Hash] with BoundedSemilattice[Hash] = @@ -402,7 +402,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H /** * Create a bloom filter with multiple items from an iterator */ - def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item)) + def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item.apply)) val empty: Hash = Empty diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala index f970c43f3..c50d912d7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala @@ -95,7 +95,7 @@ object AdaptiveMatrix { var row = 0 val iter = storage.iterator while (iter.hasNext) { - val curRow = iter.next + val curRow = iter.next() curRow.foreach { case (col, value) => buffer(row * cols + col) = value } @@ -114,7 +114,7 @@ object AdaptiveMatrix { val sparseStorage = (0 until rows).map(_ => MMap[Int, V]()).toIndexedSeq while (iter.hasNext) { - val current = iter.next + val current = iter.next() current match { case d @ DenseMatrix(_, _, _) => return denseUpdate(d, iter) case s @ SparseColumnMatrix(_) => diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala index 
69f553360..96f201eb8 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala @@ -49,7 +49,7 @@ case class SparseColumnMatrix[V: Monoid](rowsByColumns: IndexedSeq[AdaptiveVecto while (row < rows) { val iter = rowsByColumns(row).denseIterator while (iter.hasNext) { - val (col, value) = iter.next + val (col, value):(Int,V) = iter.next() val indx = row * lcols + col buffer(indx) = valueMonoid.plus(buffer(indx), value) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala index 5c3e4c37b..38c026937 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala @@ -36,7 +36,7 @@ private class IterCallStatistics(threadSafe: Boolean) { total.add(v) // log2(v + 1) for v up to 2^maxBucket val bucket = min(64 - numberOfLeadingZeros(v), maxBucket) - distribution(bucket).increment + distribution(bucket).increment() } def count: Long = distribution.foldLeft(0L)(_ + _.get) // sum @@ -59,8 +59,8 @@ private class IterCallStatistics(threadSafe: Boolean) { private class CountingIterator[T](val i: Iterator[T]) extends Iterator[T] { private[this] final var nextCount: Long = 0 override def hasNext: Boolean = i.hasNext - override def next: T = { - val n = i.next + override def next(): T = { + val n = i.next() nextCount += 1 n } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala index ce166c250..3becb8b8a 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala @@ -37,7 +37,7 @@ class StatisticsSemigroup[T](threadSafe: Boolean = true)(implicit wrappedSemigro def getSumOptionCallTime: Long = sumOptionCallsStats.getTotalCallTime override def plus(x: T, y: T): T = { - plusCallsCount.increment + plusCallsCount.increment() Semigroup.plus(x, y) } @@ -66,7 +66,7 @@ class StatisticsMonoid[T](threadSafe: Boolean = true)(implicit wrappedMonoid: Mo def getSumCallTime: Long = sumCallsStats.getTotalCallTime override def zero: T = { - zeroCallsCount.increment + zeroCallsCount.increment() Monoid.zero } @@ -95,12 +95,12 @@ class StatisticsGroup[T](threadSafe: Boolean = true)(implicit group: Group[T]) def getMinusCallCount: Long = minusCallsCount.get override def negate(x: T): T = { - negateCallsCount.increment + negateCallsCount.increment() Group.negate(x) } override def minus(l: T, r: T): T = { - minusCallsCount.increment + minusCallsCount.increment() Group.minus(l, r) } @@ -129,12 +129,12 @@ class StatisticsRing[T](threadSafe: Boolean = true)(implicit ring: Ring[T]) def getProductCallTime: Long = productCallsStats.getTotalCallTime override def one: T = { - oneCallsCount.increment + oneCallsCount.increment() Ring.one } override def times(x: T, y: T): T = { - timesCallsCount.increment + timesCallsCount.increment() Ring.times(x, y) } diff --git a/build.sbt b/build.sbt index afc7de9c7..bcd23c4f2 100644 --- a/build.sbt +++ b/build.sbt @@ -31,6 +31,8 @@ def scalaBinaryVersion(scalaVersion: String) = scalaVersion match { case version => sys.error(s"unsupported scala version $version") } +def isScala3(scalaVersion: 
String) = scalaVersion.startsWith("3.") + def isScala212x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.12" def isScala213x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.13" @@ -110,6 +112,16 @@ val sharedSettings = Seq( scalaVersion.value ) ) ++ mimaSettings +// NOTE: After dropping Scala 2.11, we can remove src/main/scala-2.11 and share sources between scala 2.12, 2.13 and 3.x. +lazy val kindprojectorSettings = Seq( + Compile / scalacOptions ++= { + CrossVersion.partialVersion(scalaVersion.value) match { + case Some((3, _)) => Seq("-Ykind-projector:underscores") + case Some((2, 12 | 13)) => Seq("-Xsource:3", "-P:kind-projector:underscore-placeholders") + case _ => Seq.empty + } + } +) lazy val noPublishSettings = Seq( publish / skip := true, @@ -208,33 +220,43 @@ def module(name: String) = { .settings(sharedSettings ++ Seq(Keys.name := id, mimaPreviousArtifacts := previousVersion(name).toSet)) } -lazy val algebirdCore = module("core").settings( - crossScalaVersions += "2.13.8", - initialCommands := """ +lazy val algebirdCore = module("core") + .settings( + crossScalaVersions += "2.13.8", + // crossScalaVersions += "3.2.2", + initialCommands := """ import com.twitter.algebird._ """.stripMargin('|'), - libraryDependencies ++= - Seq( - "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion, - "org.typelevel" %% "algebra" % algebraVersion, - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "org.scalatest" %% "scalatest" % scalaTestVersion % "test", - "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat - ) ++ { - if (isScala213x(scalaVersion.value)) { - Seq() - } else { - Seq(compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full))) - } - }, - addCompilerPlugin(("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full)), - Compile / sourceGenerators += Def.task { - GenTupleAggregators.gen((Compile / sourceManaged).value) - }.taskValue, - // Scala 2.12's doc task was failing. - Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))), - Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a")) -) + libraryDependencies ++= + Seq( + "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion, + ("org.typelevel" %% "algebra" % algebraVersion).cross(CrossVersion.for3Use2_13), + "org.scalatest" %% "scalatest" % scalaTestVersion % "test", + "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat + ) ++ { + if (isScala3(scalaVersion.value)) { + Seq.empty + } else if (isScala213x(scalaVersion.value)) { + Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full) + ) + } else { + Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)), + compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full) + ) + } + }, + Compile / sourceGenerators += Def.task { + GenTupleAggregators.gen((Compile / sourceManaged).value) + }.taskValue, + // Scala 2.12's doc task was failing. + Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))), + Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a")) + ) + .settings(kindprojectorSettings) lazy val algebirdTest = module("test") .settings(