diff --git a/.gitignore b/.gitignore
index 836905ce4..3913864c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,7 @@ sonatype.sbt
BUILD
target/
lib_managed/
+project/metals.sbt
project/boot/
project/build/target/
project/plugins/target/
diff --git a/.scalafmt.conf b/.scalafmt.conf
index c9f903c4f..d4daaafab 100644
--- a/.scalafmt.conf
+++ b/.scalafmt.conf
@@ -1,7 +1,10 @@
version=3.6.0
runner.dialect = scala212
fileOverride {
- "glob:**/scala-2.13*/**" {
+ "glob:**/scala-3/**" {
+ runner.dialect = scala3
+ }
+ "glob:**/scala-2*/**" {
runner.dialect = scala213
}
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala b/algebird-core/src/main/scala-2.11/Aggregator.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
rename to algebird-core/src/main/scala-2.11/Aggregator.scala
index 4e78d234b..fd380a15d 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
+++ b/algebird-core/src/main/scala-2.11/Aggregator.scala
@@ -20,7 +20,7 @@ object Aggregator extends java.io.Serializable {
* This is a trivial aggregator that always returns a single value
*/
def const[T](t: T): MonoidAggregator[Any, Unit, T] =
- prepareMonoid { _: Any => () }.andThenPresent(_ => t)
+ prepareMonoid((_: Any) => ()).andThenPresent(_ => t)
/**
* Using Aggregator.prepare,present you can add to this aggregator
@@ -172,7 +172,7 @@ object Aggregator extends java.io.Serializable {
* How many items satisfy a predicate
*/
def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] =
- prepareMonoid { t: T => if (pred(t)) 1L else 0L }
+ prepareMonoid((t: T) => if (pred(t)) 1L else 0L)
/**
* Do any items satisfy some predicate
@@ -310,7 +310,7 @@ object Aggregator extends java.io.Serializable {
* Put everything in a Set. Note, this could fill the memory if the Set is very large.
*/
def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] =
- prepareMonoid { t: T => Set(t) }
+ prepareMonoid((t: T) => Set(t))
/**
* This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the
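
The lambda rewrites in this hunk are the core 2.13/3 cross-building change: a typed parameter in a bare block lambda (`{ t: T => ... }`) is rejected by Scala 3, so the parameter list gets explicit parentheses. A minimal sketch of the two forms (my illustration, not part of the patch):

```scala
object LambdaSyntaxDemo {
  // Old form, accepted by Scala 2.12 but rejected under Scala 3:
  //   val f = { t: Int => t + 1 }
  // Cross-buildable form, as used throughout this patch:
  val f: Int => Int = (t: Int) => t + 1
}
```
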
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala b/algebird-core/src/main/scala-2.11/CountMinSketch.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala
rename to algebird-core/src/main/scala-2.11/CountMinSketch.scala
index f000c7fe3..809d8785f 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala
+++ b/algebird-core/src/main/scala-2.11/CountMinSketch.scala
@@ -185,9 +185,9 @@ class CMSSummation[K](params: CMSParams[K]) {
val rit = matrix.iterator
while (rit.hasNext) {
var col = 0
- val cit = rit.next.iterator
+ val cit = rit.next().iterator
while (cit.hasNext) {
- cells(offset + col) += cit.next
+ cells(offset + col) += cit.next()
col += 1
}
offset += width
@@ -206,7 +206,7 @@ class CMSSummation[K](params: CMSParams[K]) {
b += cells(offset + col)
col += 1
}
- b.result
+ b.result()
}
val b = Vector.newBuilder[Vector[Long]]
@@ -215,7 +215,7 @@ class CMSSummation[K](params: CMSParams[K]) {
b += vectorize(row)
row += 1
}
- CMSInstance(CMSInstance.CountsTable(b.result), totalCount, params)
+ CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params)
}
}
@@ -724,7 +724,7 @@ case class CMSInstance[K](
val it = countsTable.counts.iterator
var i = 0
while (it.hasNext) {
- val row = it.next
+ val row = it.next()
val count = row(hs(i)(item))
if (count < freq) freq = count
i += 1
@@ -817,13 +817,13 @@ object CMSInstance {
val yss = other.counts.iterator
val rows = Vector.newBuilder[Vector[Long]]
while (xss.hasNext) {
- val xs = xss.next.iterator
- val ys = yss.next.iterator
+ val xs = xss.next().iterator
+ val ys = yss.next().iterator
val row = Vector.newBuilder[Long]
- while (xs.hasNext) row += (xs.next + ys.next)
- rows += row.result
+ while (xs.hasNext) row += (xs.next() + ys.next())
+ rows += row.result()
}
- CountsTable[K](rows.result)
+ CountsTable[K](rows.result())
}
}
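
The `.next` to `.next()` hunks above follow a related migration rule: `Iterator.next` is declared with an empty parameter list and has side effects, so newer compilers expect the explicit `()` at the call site. A small illustration (mine, not from the patch):

```scala
object NextParensDemo {
  def sumAll(it: Iterator[Long]): Long = {
    var total = 0L
    // `it.next` compiles on 2.12 but is deprecated on 2.13
    // and an error under Scala 3; `it.next()` works everywhere.
    while (it.hasNext) total += it.next()
    total
  }
}
```
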
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala b/algebird-core/src/main/scala-2.11/DecayedVector.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala
rename to algebird-core/src/main/scala-2.11/DecayedVector.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala b/algebird-core/src/main/scala-2.11/DecayingCMS.scala
similarity index 98%
rename from algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala
rename to algebird-core/src/main/scala-2.11/DecayingCMS.scala
index 2b6a5f157..fd8433754 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala
+++ b/algebird-core/src/main/scala-2.11/DecayingCMS.scala
@@ -210,7 +210,7 @@ final class DecayingCMS[K](
val hashFns: Array[K => Int] = {
val rng = new Random(seed)
def genPos(): Int =
- rng.nextInt match {
+ rng.nextInt() match {
case 0 => genPos()
case n => n & 0x7fffffff
}
@@ -323,10 +323,10 @@ final class DecayingCMS[K](
var i = 0
while (i < cells.length) {
val it = cells(i).iterator
- var localMax = it.next // we know it doesn't start empty
+ var localMax = it.next() // we know it doesn't start empty
if (localMax < minMinimum) minMinimum = localMax
while (it.hasNext) {
- val n = it.next
+ val n = it.next()
if (n > localMax) localMax = n
else if (n < minMinimum) minMinimum = n
}
@@ -362,7 +362,7 @@ final class DecayingCMS[K](
val it0 = this.cells(i).iterator
val it1 = that.cells(i).iterator
while (it0.hasNext) {
- val x = it0.next * it1.next
+ val x = it0.next() * it1.next()
if (x != 0.0) sum += x
}
if (sum < res) res = sum
@@ -426,7 +426,7 @@ final class DecayingCMS[K](
val x = this
val y = other
val timeInHL = Math.max(x.timeInHL, y.timeInHL)
- val cms = new CMS(allocCells, 0.0, timeInHL)
+ val cms = new CMS(allocCells(), 0.0, timeInHL)
val xscale = x.getScale(timeInHL)
val yscale = y.getScale(timeInHL)
@@ -445,7 +445,7 @@ final class DecayingCMS[K](
bldr += prod(left(j), xscale) + prod(right(j), yscale)
j += 1
}
- cms.cells(i) = bldr.result
+ cms.cells(i) = bldr.result()
i += 1
}
cms
@@ -505,7 +505,7 @@ final class DecayingCMS[K](
if (expL == 0.0) {
new CMS(monoid.zero.cells, 0.0, ts)
} else {
- val cms = new CMS(allocCells, 0.0, ts)
+ val cms = new CMS(allocCells(), 0.0, ts)
var i = 0
while (i < depth) {
val ci = cells(i)
@@ -547,7 +547,7 @@ final class DecayingCMS[K](
bldr += scratch(j)
j += 1
}
- cells(i) = bldr.result
+ cells(i) = bldr.result()
i += 1
}
cells
@@ -606,7 +606,7 @@ final class DecayingCMS[K](
val arr = new Array[CMS](ChunkSize)
while (it.hasNext) {
while (it.hasNext && i < ChunkSize) {
- arr(i) = it.next
+ arr(i) = it.next()
i += 1
}
if (i > 1) {
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala b/algebird-core/src/main/scala-2.11/Fold.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/Fold.scala
rename to algebird-core/src/main/scala-2.11/Fold.scala
index c2f21d145..ded32e628 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala
+++ b/algebird-core/src/main/scala-2.11/Fold.scala
@@ -66,8 +66,8 @@ sealed trait Fold[-I, +O] extends Serializable {
val self = this
new Fold[I, P] {
type X = self.X
- override def build: FoldState[X, I, P] =
- self.build.map(f)
+ override def build(): FoldState[X, I, P] =
+ self.build().map(f)
}
}
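
Here the same rule applies at the declaration site: `Fold.build` is defined with an empty parameter list, and Scala 3 (and 2.13 under `-Xsource:3`) requires the override to repeat it. A simplified sketch with hypothetical names:

```scala
trait HasBuild {
  def build(): String
}
final class BuildImpl extends HasBuild {
  // `override def build: String = ...` would no longer match the parent signature.
  override def build(): String = "ok"
}
```
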
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Interval.scala b/algebird-core/src/main/scala-2.11/Interval.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/Interval.scala
rename to algebird-core/src/main/scala-2.11/Interval.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.11/InvariantAlgebras.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala
rename to algebird-core/src/main/scala-2.11/InvariantAlgebras.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala b/algebird-core/src/main/scala-2.11/JavaMonoids.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala
rename to algebird-core/src/main/scala-2.11/JavaMonoids.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala b/algebird-core/src/main/scala-2.11/MapAlgebra.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala
rename to algebird-core/src/main/scala-2.11/MapAlgebra.scala
index 8ee81c42d..55a9f8e54 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala
+++ b/algebird-core/src/main/scala-2.11/MapAlgebra.scala
@@ -224,7 +224,7 @@ object MapAlgebra {
} else oldVOpt.get
bldr += v
}
- mutable.iterator.map { case (k, bldr) => (k, bldr.result) }.toMap
+ mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
}
// Consider this as edges from k -> v, produce a Map[K,Set[V]]
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala b/algebird-core/src/main/scala-2.11/Scan.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/Scan.scala
rename to algebird-core/src/main/scala-2.11/Scan.scala
index ff0dce400..d1d10ced7 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala
+++ b/algebird-core/src/main/scala-2.11/Scan.scala
@@ -169,9 +169,9 @@ sealed abstract class Scan[-I, +O] extends Serializable {
def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
override def hasNext: Boolean = iter.hasNext
var state: State = initialState
- override def next: O = {
+ override def next(): O = {
val thisState = state
- val thisA = iter.next
+ val thisA = iter.next()
val (thisC, nextState) = presentAndNextState(thisA, thisState)
state = nextState
thisC
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala b/algebird-core/src/main/scala-2.11/SpaceSaver.scala
similarity index 99%
rename from algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala
rename to algebird-core/src/main/scala-2.11/SpaceSaver.scala
index 68830547e..d18b58dd6 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala
+++ b/algebird-core/src/main/scala-2.11/SpaceSaver.scala
@@ -78,7 +78,7 @@ object SpaceSaver {
buff.putLong(b)
buffer ++= buff.array()
}
- buffer.result.toArray
+ buffer.result().toArray
}
// Make sure to be reversible so fromBytes(toBytes(x)) == x
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala b/algebird-core/src/main/scala-2.11/VectorSpace.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala
rename to algebird-core/src/main/scala-2.11/VectorSpace.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.11/monad/EitherMonad.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala
rename to algebird-core/src/main/scala-2.11/monad/EitherMonad.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala b/algebird-core/src/main/scala-2.11/monad/Reader.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala
rename to algebird-core/src/main/scala-2.11/monad/Reader.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala b/algebird-core/src/main/scala-2.11/monad/StateWithError.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala
rename to algebird-core/src/main/scala-2.11/monad/StateWithError.scala
diff --git a/algebird-core/src/main/scala-2.12/Aggregator.scala b/algebird-core/src/main/scala-2.12/Aggregator.scala
new file mode 100644
index 000000000..8a4d2b230
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Aggregator.scala
@@ -0,0 +1,637 @@
+package com.twitter.algebird
+
+import java.util.PriorityQueue
+import scala.collection.compat._
+import scala.collection.generic.CanBuildFrom
+
+/**
+ * Aggregators compose well.
+ *
+ * To create a parallel aggregator that operates on a single input in parallel, use:
+ * GeneratedTupleAggregator.from2((agg1, agg2))
+ */
+object Aggregator extends java.io.Serializable {
+ implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] =
+ new AggregatorApplicative[I]
+
+ private val DefaultSeed = 471312384
+
+ /**
+ * This is a trivial aggregator that always returns a single value
+ */
+ def const[T](t: T): MonoidAggregator[Any, Unit, T] =
+ prepareMonoid { (_: Any) => () }.andThenPresent(_ => t)
+
+ /**
+ * Using Aggregator.prepare,present you can add to this aggregator
+ */
+ def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] =
+ fromSemigroup(Semigroup.from(red))
+ def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] =
+ new Aggregator[T, T, T] {
+ override def prepare(input: T): T = input
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] =
+ prepareMonoid(identity[T])
+ // Uses the product from the ring
+ def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] =
+ fromRing[T, T](rng, identity[T])
+
+ def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] =
+ prepareMonoid(prep)(mon)
+
+ def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] =
+ new Aggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ new MonoidAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def monoid: Monoid[T] = m
+ override def present(reduction: T): T = reduction
+ }
+ // Uses the product from the ring
+ def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] =
+ new RingAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def ring: Ring[T] = rng
+ override def present(reduction: T): T = reduction
+ }
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to
+ * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}}
+ */
+ def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, T] =
+ appendSemigroup(prep, appnd, identity[T])(sg)
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Semigroup]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param prep
+ * The preparation function. Expected to construct an instance of type T from a single data element.
+ * @param appnd
+ * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator.
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param sg
+ * The [[Semigroup]] type class
+ * @note
+ * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}}
+ */
+ def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, P] =
+ new Aggregator[F, T, P] {
+ override def semigroup: Semigroup[T] = sg
+ override def prepare(input: F): T = prep(input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P =
+ applyOption(inputs).get
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ agg(inputs).map(pres)
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ if (items.iterator.isEmpty) old else reduce(old, agg(items).get)
+
+ private def agg(inputs: TraversableOnce[F]): Option[T] =
+ if (inputs.iterator.isEmpty) None
+ else {
+ val itr = inputs.iterator
+ val t = prepare(itr.next)
+ Some(itr.foldLeft(t)(appnd))
+ }
+ }
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent
+ * to {{{appendMonoid(appnd, identity[T]_)(m)}}}
+ */
+ def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ appendMonoid(appnd, identity[T])(m)
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Monoid]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param appnd
+ * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this
+ * aggregator. Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param m
+ * The [[Monoid]] type class
+ * @note
+ * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}}
+ */
+ def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit
+ m: Monoid[T]
+ ): MonoidAggregator[F, T, P] =
+ new MonoidAggregator[F, T, P] {
+ override def monoid: Monoid[T] = m
+ override def prepare(input: F): T = appnd(m.zero, input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs))
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ if (inputs.isEmpty) None else Some(apply(inputs))
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ reduce(old, agg(items))
+
+ override def appendAll(items: TraversableOnce[F]): T = agg(items)
+
+ private def agg(inputs: TraversableOnce[F]): T =
+ inputs.foldLeft(m.zero)(append)
+ }
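
A quick usage sketch of `appendMonoid` (my example, not from the patch): summing string lengths with an `appnd` that satisfies the stated law `appnd(t, f) == m.plus(t, appnd(m.zero, f))`:

```scala
// Monoid[Long] here is algebird's standard additive Long monoid.
val totalLength: MonoidAggregator[String, Long, Long] =
  Aggregator.appendMonoid[String, Long]((acc, s) => acc + s.length)

// totalLength(List("ab", "cde")) == 5L
```
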
+
+ /**
+ * How many items satisfy a predicate
+ */
+ def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] =
+ prepareMonoid { (t: T) => if (pred(t)) 1L else 0L }
+
+ /**
+ * Do any items satisfy some predicate
+ */
+ def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(OrVal.unboxedMonoid)
+
+ /**
+ * Do all items satisfy a predicate
+ */
+ def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(AndVal.unboxedMonoid)
+
+ /**
+ * Take the first (left most in reduce order) item found
+ */
+ def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l)
+
+ /**
+ * Take the last (right most in reduce order) item found
+ */
+ def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r)
+
+ /**
+ * Get the maximum item
+ */
+ def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T]
+ def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ max[U]
+ }
+
+ /**
+ * Get the minimum item
+ */
+ def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T]
+ def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ min[U]
+ }
+
+ /**
+ * This returns the number of items we find
+ */
+ def size: MonoidAggregator[Any, Long, Long] =
+ prepareMonoid((_: Any) => 1L)
+
+ /**
+ * Take the smallest `count` items using a heap
+ */
+ def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)
+
+ /**
+ * Same as sortedTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).take(count).
+ */
+ def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedTake(count)(Ordering.by(fn))
+
+ /**
+ * Take the largest `count` items using a heap
+ */
+ def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Same as sortedReverseTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).reverse.take(count).
+ */
+ def sortByReverseTake[T, U: Ordering](
+ count: Int
+ )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedReverseTake(count)(Ordering.by(fn))
+
+ /**
+ * Immutable version of sortedTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)
+
+ /**
+ * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Randomly selects input items where each item has an independent probability 'prob' of being selected.
+ * This assumes that all sampled records can fit in memory, so use this only when the expected number of
+ * sampled values is small.
+ */
+ def randomSample[T](
+ prob: Double,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, Option[Batched[T]], List[T]] = {
+ assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]")
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .filter(_ => rng.nextDouble() <= prob)
+ .monoidAggregate(toList)
+ }
+
+ /**
+   * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than
+ * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only
+ * for small values of 'count'.
+ */
+ def reservoirSample[T](
+ count: Int,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .map(rng.nextDouble() -> _)
+ .monoidAggregate(sortByTake(count)(_._1))
+ .andThenPresent(_.map(_._2))
+ }
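
A short sketch of how the two samplers differ in practice (my example; the values are arbitrary):

```scala
val xs = (1 to 1000).toList

// Each element kept independently with probability 0.1: expected size ~100.
val tenPercent: List[Int] = Aggregator.randomSample[Int](prob = 0.1).apply(xs)

// Exactly 3 elements, chosen uniformly at random.
val threeOfThem: Seq[Int] = Aggregator.reservoirSample[Int](count = 3).apply(xs)
```
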
+
+ /**
+ * Put everything in a List. Note, this could fill the memory if the List is very large.
+ */
+ def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] =
+ new MonoidAggregator[T, Option[Batched[T]], List[T]] {
+ override def prepare(t: T): Option[Batched[T]] = Some(Batched(t))
+ override def monoid: Monoid[Option[Batched[T]]] =
+ Monoid.optionMonoid(Batched.semigroup)
+ override def present(o: Option[Batched[T]]): List[T] =
+ o.map(_.toList).getOrElse(Nil)
+ }
+
+ /**
+ * Put everything in a Set. Note, this could fill the memory if the Set is very large.
+ */
+ def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] =
+ prepareMonoid { (t: T) => Set(t) }
+
+ /**
+ * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the
+ * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an
+ * approximate version of this that is scalable.
+ */
+ def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] =
+ toSet[T].andThenPresent(_.size)
+
+ /**
+ * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set
+   * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error, which uses at most 8192 bytes for
+ * each HLL. For more control, see HyperLogLogAggregator.
+ */
+ def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] =
+ SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)
+
+ /**
+ * Returns the lower bound of a given percentile where the percentile is between (0,1] The items that are
+ * iterated over cannot be negative.
+ */
+ def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregatorLowerBound[T] =
+ QTreeAggregatorLowerBound[T](percentile, k)
+
+ /**
+ * Returns the intersection of a bounded percentile where the percentile is between (0,1] The items that are
+ * iterated over cannot be negative.
+ */
+ def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregator[T] =
+ QTreeAggregator[T](percentile, k)
+
+ /**
+ * An aggregator that sums Numeric values into Doubles.
+ *
+ * This is really no more than converting to Double and then summing. The conversion to double means we
+ * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue).
+ *
+ * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you
+ * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T]
+ * after importing the numericRing implicit:
+ *
+   * > import com.twitter.algebird.Ring.numericRing
+   * > def numericAggregator[T: Numeric]: MonoidAggregator[T, T, T] = Aggregator.fromMonoid[T]
+ */
+ def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] =
+ Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid)
+
+}
+
+/**
+ * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup,
+ * then finally we present the results.
+ *
+ * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators
+ * are useful in parallel map/reduce systems where there may be some additional types needed to cross the
+ * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle
+ * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag:
+ * Aggregator[T, _, Int]): Int)
+ *
+ * Note, join is very useful to combine multiple aggregations with one pass. Also
+ * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well.
+ *
+ * This type is the Fold.M from Haskell's folds package:
+ * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html
+ */
+trait Aggregator[-A, B, +C] extends java.io.Serializable { self =>
+ def prepare(input: A): B
+ def semigroup: Semigroup[B]
+ def present(reduction: B): C
+
+ /* *****
+ * All the following are in terms of the above
+ */
+
+ /**
+ * combine two inner values
+ */
+ def reduce(l: B, r: B): B = semigroup.plus(l, r)
+
+ /**
+   * This may error if items is empty. To be safe, you might use reduceOption if you don't know that items is
+   * non-empty.
+ */
+ def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get
+
+ /**
+   * This is the safe version of the above. If the input is empty, return None, else reduce the items.
+ */
+ def reduceOption(items: TraversableOnce[B]): Option[B] =
+ semigroup.sumOption(items)
+
+ /**
+   * This may error if inputs are empty (for MonoidAggregators it never will; instead you see
+   * present(Monoid.zero[B]))
+ */
+ def apply(inputs: TraversableOnce[A]): C =
+ present(reduce(inputs.iterator.map(prepare)))
+
+ /**
+ * This returns None if the inputs are empty
+ */
+ def applyOption(inputs: TraversableOnce[A]): Option[C] =
+ reduceOption(inputs.iterator.map(prepare))
+ .map(present)
+
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def cumulativeIterator(inputs: Iterator[A]): Iterator[C] =
+ inputs
+ .scanLeft(None: Option[B]) {
+ case (None, a) => Some(prepare(a))
+ case (Some(b), a) => Some(append(b, a))
+ }
+ .collect { case Some(b) => present(b) }
+
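
For intuition, a one-liner showing what `cumulativeIterator` yields (my example):

```scala
val sum = Aggregator.fromMonoid[Int]
val running: List[Int] = sum.cumulativeIterator(Iterator(1, 2, 3)).toList
// running == List(1, 3, 6)
```
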
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def applyCumulatively[In <: TraversableOnce[A], Out](
+ inputs: In
+ )(implicit bf: CanBuildFrom[In, C, Out]): Out =
+ (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator))
+
+ def append(l: B, r: A): B = reduce(l, prepare(r))
+
+ def appendAll(old: B, items: TraversableOnce[A]): B =
+ if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare)))
+
+ /** Like calling andThen on the present function */
+ def andThenPresent[D](present2: C => D): Aggregator[A, B, D] =
+ new Aggregator[A, B, D] {
+ override def prepare(input: A): B = self.prepare(input)
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): D = present2(self.present(reduction))
+ }
+
+ /** Like calling compose on the prepare function */
+ def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] =
+ new Aggregator[A1, B, C] {
+ override def prepare(input: A1): B = self.prepare(prepare2(input))
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to run two aggregators on the same data with a single pass
+ */
+ def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] =
+ GeneratedTupleAggregator.from2((this, that))
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = this
+ new Aggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
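
A small sketch contrasting `join` and `zip` (my example): `join` runs two aggregators over the same inputs in one pass, while `zip` consumes paired inputs:

```scala
val minAndMax: Aggregator[Int, (Int, Int), (Int, Int)] =
  Aggregator.min[Int].join(Aggregator.max[Int])

// minAndMax(List(3, 1, 2)) == (1, 3)
// whereas zip obeys: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
```
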
+
+ /**
+   * An Aggregator can be converted to a Fold, but not vice versa. Note: a Fold is more constrained, so only
+   * do this if you require joining a Fold with an Aggregator to produce a Fold.
+ */
+ def toFold: Fold[A, Option[C]] =
+ Fold.fold[Option[B], A, Option[C]](
+ {
+ case (None, a) => Some(self.prepare(a))
+ case (Some(b), a) => Some(self.append(b, a))
+ },
+ None,
+ _.map(self.present)
+ )
+
+ def lift: MonoidAggregator[A, Option[B], Option[C]] =
+ new MonoidAggregator[A, Option[B], Option[C]] {
+ override def prepare(input: A): Option[B] = Some(self.prepare(input))
+ override def present(reduction: Option[B]): Option[C] = reduction.map(self.present)
+ override def monoid = new OptionMonoid[B]()(self.semigroup)
+ }
+}
+
+/**
+ * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the
+ * middle type use join on the trait, or GeneratedTupleAggregator.fromN
+ */
+class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] {
+ override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] =
+ mt.andThenPresent(fn)
+ override def apply[T](v: T): Aggregator[I, ?, T] =
+ Aggregator.const(v)
+ override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] =
+ mt.join(mu)
+ override def join[T1, T2, T3](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3]
+ ): Aggregator[I, ?, (T1, T2, T3)] =
+ GeneratedTupleAggregator.from3((m1, m2, m3))
+
+ override def join[T1, T2, T3, T4](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4]
+ ): Aggregator[I, ?, (T1, T2, T3, T4)] =
+ GeneratedTupleAggregator.from4((m1, m2, m3, m4))
+
+ override def join[T1, T2, T3, T4, T5](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4],
+ m5: Aggregator[I, ?, T5]
+ ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] =
+ GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5))
+}
+
+trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self =>
+ def monoid: Monoid[B]
+ override def semigroup: Monoid[B] = monoid
+ final override def reduce(items: TraversableOnce[B]): B =
+ monoid.sum(items)
+
+ def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare))
+
+ override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = {
+ val self = this
+ new MonoidAggregator[A, B, D] {
+ override def prepare(a: A): B = self.prepare(a)
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): D = present2(self.present(b))
+ }
+ }
+ override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = {
+ val self = this
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B = self.prepare(prepare2(a))
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+ }
+
+ /**
+ * Build a MonoidAggregator that either takes left or right input and outputs the pair from both
+ */
+ def either[A2, B2, C2](
+ that: MonoidAggregator[A2, B2, C2]
+ ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] =
+ new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] {
+ override def prepare(e: Either[A, A2]): (B, B2) = e match {
+ case Left(a) => (self.prepare(a), that.monoid.zero)
+ case Right(a2) => (self.monoid.zero, that.prepare(a2))
+ }
+ override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid)
+ override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2))
+ }
+
+ /**
+ * Only transform values where the function is defined, else discard
+ */
+ def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] =
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B =
+ if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * Only aggregate items that match a predicate
+ */
+ def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] =
+ new MonoidAggregator[A1, B, C] {
+ override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator
+ */
+ def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] =
+ new MonoidAggregator[TraversableOnce[A], B, C] {
+ override def monoid: Monoid[B] = self.monoid
+ override def prepare(input: TraversableOnce[A]): B =
+ monoid.sum(input.iterator.map(self.prepare))
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = self
+ new MonoidAggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
+}
+
+trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] {
+ def ring: Ring[B]
+ override def monoid: Monoid[B] = Ring.asTimesMonoid(ring)
+}
diff --git a/algebird-core/src/main/scala-2.12/CountMinSketch.scala b/algebird-core/src/main/scala-2.12/CountMinSketch.scala
new file mode 100644
index 000000000..826aebd5a
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/CountMinSketch.scala
@@ -0,0 +1,1420 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import algebra.CommutativeMonoid
+
+import scala.collection.compat._
+
+/**
+ * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear
+ * space.
+ *
+ * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error
+ * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`.
+ *
+ * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively.
+ *
+ * Then:
+ *
+ * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`.
+ * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes.
+ * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] +=
+ * 1`, for each `1 <= i <= d`.
+ * - (Note the rough similarity to a Bloom filter.)
+ *
+ * As an example application, suppose you want to estimate the number of times an element `x` has appeared in
+ * a data stream so far. The Count-Min sketch estimate of this frequency is
+ *
+ * min_i { counts[i, h_i[x]] }
+ *
+ * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true
+ * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far.
+ *
+ * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the
+ * estimates and error bounds used in this implementation.
+ *
+ * Parts of this implementation are taken from
+ * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
+ *
+ * @author
+ * Edwin Chen
+ */
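
A worked instance of the `d`/`w` relations above (my numbers; they mirror the `CMSFunctions.depth` and `CMSFunctions.width` helpers defined later in this file):

```scala
val delta = 1e-8
val eps = 0.005
val d = math.ceil(math.log(1.0 / delta)).toInt // 19 rows (hash functions)
val w = math.ceil(math.E / eps).toInt          // 544 columns (counters per row)
// The counts table therefore holds d * w = 10336 Long counters.
```
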
+/**
+ * Monoid for adding CMS sketches.
+ *
+ * =Usage=
+ *
+ * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in
+ * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are
+ * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor
+ * depending on eps."
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param seed
+ * A seed to initialize the random number generator used to create the pairwise independent hash functions.
+ * @param maxExactCountOpt
+ * An Option parameter about how many exact counts a sparse CMS wants to keep.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be
+ * imported. Which type K should you pick in practice? For domains that have fewer than `2^64` unique
+ * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other
+ * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire),
+ * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]).
+ */
+class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None)
+ extends Monoid[CMS[K]]
+ with CommutativeMonoid[CMS[K]] {
+
+ val params: CMSParams[K] = {
+ val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+
+ override val zero: CMS[K] = CMSZero[K](params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: CMS[K], right: CMS[K]): CMS[K] = {
+ require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.")
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): CMS[K] = CMSItem[K](item, 1L, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ data.foreach(k => summation.insert(k, 1L))
+ summation.result
+ }
+
+ override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+
+ override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ summation.updateAll(sketches)
+ summation.result
+ }
+}
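
Typical use of the monoid (a sketch with arbitrary parameter values, not part of the patch):

```scala
val cmsMonoid: CMSMonoid[Long] = CMS.monoid[Long](eps = 0.001, delta = 1e-10, seed = 1)
val cms: CMS[Long] = cmsMonoid.create(Seq(1L, 1L, 2L))
val freq: Approximate[Long] = cms.frequency(1L) // point estimate >= 2
```
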
+
+/**
+ * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability
+ * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without
+ * letting a reference to the instance escape into a closure.
+ */
+class CMSSummation[K](params: CMSParams[K]) {
+ private[this] val hashes = params.hashes.toArray
+ private[this] val height = CMSFunctions.depth(params.delta)
+ private[this] val width = CMSFunctions.width(params.eps)
+ private[this] val cells = new Array[Long](height * width)
+ private[this] var totalCount = 0L
+
+ final def insert(k: K, count: Long): Unit = {
+ var row = 0
+ var offset = 0
+ val hs = hashes
+ while (row < hs.length) {
+ cells(offset + hs(row)(k)) += count
+ offset += width
+ row += 1
+ }
+ totalCount += count
+ }
+
+ def updateAll(sketches: TraversableOnce[CMS[K]]): Unit =
+ sketches.iterator.foreach(updateInto)
+
+ def updateInto(cms: CMS[K]): Unit =
+ cms match {
+ case CMSZero(_) =>
+ ()
+ case CMSItem(item, count, _) =>
+ insert(item, count)
+ case SparseCMS(table, _, _) =>
+ table.foreach { case (item, c) =>
+ insert(item, c)
+ }
+ case CMSInstance(CMSInstance.CountsTable(matrix), count, _) =>
+ var offset = 0
+ val rit = matrix.iterator
+ while (rit.hasNext) {
+ var col = 0
+ val cit = rit.next().iterator
+ while (cit.hasNext) {
+ cells(offset + col) += cit.next()
+ col += 1
+ }
+ offset += width
+ }
+ totalCount += count
+ }
+
+ def result: CMS[K] =
+ if (totalCount == 0L) CMSZero(params)
+ else {
+ def vectorize(row: Int): Vector[Long] = {
+ val offset = row * width
+ val b = Vector.newBuilder[Long]
+ var col = 0
+ while (col < width) {
+ b += cells(offset + col)
+ col += 1
+ }
+ b.result()
+ }
+
+ val b = Vector.newBuilder[Vector[Long]]
+ var row = 0
+ while (row < height) {
+ b += vectorize(row)
+ row += 1
+ }
+ CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params)
+ }
+}
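
Per the class comment, the intended pattern is to allocate, fill, and call `result` inside a single method so the mutable state never escapes. A sketch (hypothetical helper name):

```scala
def sketchOfPairs[K](params: CMSParams[K], pairs: Seq[(K, Long)]): CMS[K] = {
  val summation = new CMSSummation(params)
  pairs.foreach { case (k, n) => summation.insert(k, n) }
  summation.result // the builder reference never leaves this method
}
```
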
+
+/**
+ * An Aggregator for [[CMS]]. Can be created using CMS.aggregator.
+ */
+case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] {
+ override val monoid: CMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): CMS[K] = monoid.create(value)
+
+ override def present(cms: CMS[K]): CMS[K] = cms
+
+}
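
Usage sketch (my example, assuming the `CMSHasher[String]` instance algebird ships): the aggregator builds a sketch directly from raw keys:

```scala
val agg: CMSAggregator[String] = CMS.aggregator[String](eps = 0.01, delta = 1e-6, seed = 1)
val sketch: CMS[String] = agg(Seq("a", "b", "a"))
// sketch.frequency("a").estimate >= 2
```
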
+
+/**
+ * Configuration parameters for [[CMS]].
+ *
+ * @param hashes
+ * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from
+ * `delta`).
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param maxExactCountOpt
+ * An Option parameter about how many exact counts a sparse CMS wants to keep.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+case class CMSParams[K](
+ hashes: Seq[CMSHash[K]],
+ eps: Double,
+ delta: Double,
+ maxExactCountOpt: Option[Int] = None
+) {
+
+ require(0 < eps && eps < 1, "eps must lie in (0, 1)")
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ require(
+ hashes.size >= CMSFunctions.depth(delta),
+ s"we require at least ${CMSFunctions.depth(delta)} hash functions"
+ )
+
+}
+
+/**
+ * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]).
+ */
+object CMSFunctions {
+
+ /**
+ * Translates from `width` to `eps`.
+ */
+ def eps(width: Int): Double = scala.math.exp(1.0) / width
+
+ /**
+ * Translates from `depth` to `delta`.
+ */
+ @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta")
+ def delta(depth: Int): Double = {
+ val i = scala.math.exp(-depth)
+ require(
+ i > 0.0,
+ s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)"
+ )
+ i
+ }
+
+ /**
+ * Translates from `delta` to `depth`.
+ */
+  @throws[IllegalArgumentException]("if delta is not in (0, 1)")
+ def depth(delta: Double): Int = {
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ scala.math.ceil(scala.math.log(1.0 / delta)).toInt
+ }
+
+ /**
+ * Translates from `eps` to `width`.
+ */
+ def width(eps: Double): Int =
+ scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt
+
+ /**
+ * Compute maxExactCount from parameters or `depth` and `width`
+ */
+ def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int =
+ maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50))
+
+ // Eliminates precision errors such as the following:
+ //
+ // scala> val width = 39
+ // scala> scala.math.exp(1) / CMSFunctions.eps(width)
+ // res171: Double = 39.00000000000001 <<< should be 39.0
+ //
+ // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal
+ // places should be 6.
+ private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) =
+ BigDecimal(i)
+ .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP)
+ .toDouble
+
+ /**
+ * Generates `N=depth` pair-wise independent hash functions.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * Error bound on the probability that a query estimate does NOT lie within some small interval around the
+ * truth.
+ * @param seed
+ * Seed for the random number generator.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @return
+ * The generated hash functions.
+ */
+ def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = {
+ // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form
+ //
+ // h_i(x) = a_i * x + b_i (mod p)
+ //
+ // But for this particular application, setting b_i does not matter (since all it does is shift the results of a
+ // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form
+ //
+ // h_i(x) = a_i * x (mod p)
+ //
+ val r = new scala.util.Random(seed)
+ val numHashes = depth(delta)
+ val numCounters = width(eps)
+ (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters))
+ }
+
+}
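
These helpers combine into a `CMSParams` the way `CMS.apply` does further below; a brief sketch (my example values, assuming the shipped `CMSHasher[Long]`):

```scala
val eps = 0.01
val delta = 1e-6
val hashes: Seq[CMSHash[Long]] = CMSFunctions.generateHashes[Long](eps, delta, seed = 1)
val params: CMSParams[Long] = CMSParams(hashes, eps, delta)
// params now carries depth(delta) hash functions over width(eps) counters.
```
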
+
+/**
+ * A trait for CMS implementations that can count elements in a data stream and that can answer point queries
+ * (i.e. frequency estimates) for these elements.
+ *
+ * Known implementations: [[CMS]], [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @tparam C
+ * The type of the actual CMS that implements this trait.
+ */
+trait CMSCounting[K, C[_]] {
+
+ /**
+ * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate.
+ */
+ def eps: Double
+
+ /**
+ * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an
+ * interval that depends on `eps`) around the truth.
+ */
+ def delta: Double
+
+ /**
+ * Number of hash functions (also: number of rows in the counting table). This number is derived from
+ * `delta`.
+ */
+ def depth: Int = CMSFunctions.depth(delta)
+
+ /**
+ * Number of counters per hash function (also: number of columns in the counting table). This number is
+ * derived from `eps`.
+ */
+ def width: Int = CMSFunctions.width(eps)
+
+ /**
+ * An Option parameter about how many exact counts a sparse CMS wants to keep
+ */
+ def maxExactCountOpt: Option[Int]
+
+ /**
+ * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`.
+ */
+ def maxExactCount: Int =
+ CMSFunctions.maxExactCount(maxExactCountOpt, depth, width)
+
+ /**
+ * Returns a new sketch that is the combination of this sketch and the other sketch.
+ */
+ def ++(other: C[K]): C[K]
+
+ /**
+ * Counts the item and returns the result as a new sketch.
+ */
+ def +(item: K): C[K] = this + (item, 1L)
+
+ /**
+ * Counts the item `count` times and returns the result as a new sketch.
+ */
+ def +(item: K, count: Long): C[K]
+
+ /**
+ * Returns an estimate of the total number of times this item has been seen in the stream so far. This
+ * estimate is an upper bound.
+ *
+ * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also
+ * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`.
+ */
+ def frequency(item: K): Approximate[Long]
+
+ /**
+ * Returns an estimate of the inner product against another data stream.
+ *
+ * In other words, let a_i denote the number of times element i has been seen in the data stream summarized
+   * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of
+   * `<a, b> = \sum a_i b_i`.
+ *
+ * Note: This can also be viewed as the join size between two relations.
+ *
+ * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it
+ * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`.
+ */
+ def innerProduct(other: C[K]): Approximate[Long]
+
+ /**
+ * Total number of elements counted (i.e. seen in the data stream) so far.
+ */
+ def totalCount: Long
+
+ /**
+ * The first frequency moment is the total number of elements in the stream.
+ */
+ def f1: Long = totalCount
+
+ /**
+ * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element.
+ */
+ def f2: Approximate[Long]
+
+}
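
The guarantees above are surfaced through `Approximate[Long]`; a sketch of reading them (my example, hypothetical helper):

```scala
def describeFrequency[K](cms: CMS[K], item: K): String = {
  val f = cms.frequency(item)
  // f.estimate >= true frequency; [f.min, f.max] holds with probability f.probWithinBounds.
  s"estimate=${f.estimate} in [${f.min}, ${f.max}] (prob >= ${f.probWithinBounds})"
}
```
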
+
+/**
+ * A trait for CMS implementations that can track heavy hitters in a data stream.
+ *
+ * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one
+ * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N"
+ * heavy hitters.
+ *
+ * Known implementations: [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+trait CMSHeavyHitters[K] {
+
+ /**
+ * The pluggable logic of how heavy hitters are being tracked.
+ */
+ def heavyHittersLogic: HeavyHittersLogic[K]
+
+ /**
+ * Returns the set of heavy hitters.
+ */
+ def heavyHitters: Set[K]
+
+}
+
+object CMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] =
+ monoid(eps, delta, seed, None)
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSMonoid[K] =
+ new CMSMonoid[K](eps, delta, seed, maxExactCountOpt)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] =
+ monoid(depth, width, seed, None)
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] =
+ aggregator(eps, delta, seed, None)
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] =
+ aggregator(depth, width, seed, None)
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ /**
+ * Returns a fresh, zeroed CMS instance.
+ */
+ def apply[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int] = None
+ ): CMS[K] = {
+ val params = {
+ val hashes: Seq[CMSHash[K]] =
+ CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+ CMSZero[K](params)
+ }
+
+}
+
+/**
+ * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data
+ * stream.
+ *
+ * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]].
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ * {{{
+ * // Creates a monoid for a CMS that can count `Long` elements.
+ * val cmsMonoid: CMSMonoid[Long] = {
+ *   val eps = 0.001
+ *   val delta = 1E-10
+ *   val seed = 1
+ *   CMS.monoid[Long](eps, delta, seed)
+ * }
+ *
+ * // Creates a CMS instance that has counted the element `1L`.
+ * val cms: CMS[Long] = cmsMonoid.create(1L)
+ *
+ * // Estimates the frequency of `1L`.
+ * val estimate: Approximate[Long] = cms.frequency(1L)
+ * }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] {
+
+ override val eps: Double = params.eps
+
+ override val delta: Double = params.delta
+
+ override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+ override val totalCount: Long = 0L
+
+ override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+ override def ++(other: CMS[K]): CMS[K] = other
+
+ override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+ extends CMS[K](params) {
+
+ override def +(x: K, count: Long): CMS[K] =
+ SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] =>
+ CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+ case _ => other + item
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+ exactCountTable: Map[K, Long],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+ import SparseCMS._
+
+ override def +(x: K, count: Long): CMS[K] = {
+ val currentCount = exactCountTable.getOrElse(x, 0L)
+ val newTable = exactCountTable.updated(x, currentCount + count)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + count, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+ }
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + (other.item, other.totalCount)
+ case other: SparseCMS[K] =>
+          // This SparseCMS's maxExactCount is used, so ++ is not commutative
+ val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+
+ case other: CMSInstance[K] => other ++ this
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ exactCountTable.iterator
+ .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+ .reduceOption(_ + _)
+ .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+ /**
+ * Creates a new [[SparseCMS]] with empty exactCountTable
+ */
+ def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+ val exactCountTable = Map[K, Long]()
+ SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+ }
+
+ /**
+ * Creates a new [[CMSInstance]] from a Map[K, Long]
+ */
+ def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+ exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+}
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
+ */
+case class CMSInstance[K](
+ countsTable: CMSInstance.CountsTable[K],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + other.item
+ case other: SparseCMS[K] =>
+ other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+ case other: CMSInstance[K] =>
+ val newTable = countsTable ++ other.countsTable
+ val newTotalCount = totalCount + other.totalCount
+ CMSInstance[K](newTable, newTotalCount, params)
+ }
+
+ private def makeApprox(est: Long): Approximate[Long] =
+ if (est == 0L) Approximate.exact(0L)
+ else {
+ val lower = math.max(0L, est - (eps * totalCount).toLong)
+ Approximate(lower, est, est, 1 - delta)
+ }
+
+ override def frequency(item: K): Approximate[Long] = {
+ var freq = Long.MaxValue
+ val hs = params.hashes
+ val it = countsTable.counts.iterator
+ var i = 0
+ while (it.hasNext) {
+ val row = it.next()
+ val count = row(hs(i)(item))
+ if (count < freq) freq = count
+ i += 1
+ }
+ makeApprox(freq)
+ }
+
+ /**
+ * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+ * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+   * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
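+   *
+   * A minimal usage sketch (assuming a `CMSMonoid[Long]` named `monoid` is in scope; values are
+   * illustrative):
+   *
+   * {{{
+   * val a = monoid.create(Seq(1L, 2L, 2L))
+   * val b = monoid.create(Seq(2L, 2L, 3L))
+   * // only 2L occurs in both streams (twice each), so the estimate is close to 2 * 2 = 4
+   * val est: Approximate[Long] = a.innerProduct(b)
+   * }}}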
+ */
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ other match {
+ case other: CMSInstance[?] =>
+ require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+ def innerProductAtDepth(d: Int) =
+ (0 to (width - 1)).iterator.map { w =>
+ countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+ }.sum
+
+ val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+ val minimum =
+ math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+ Approximate(minimum, est, est, 1 - delta)
+ case _ => other.innerProduct(this)
+ }
+
+ override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCountsTable =
+ (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+ val pos = (row, params.hashes(row)(item))
+ table + (pos, count)
+ }
+ CMSInstance[K](newCountsTable, totalCount + count, params)
+ } else this
+ }
+
+}
+
+object CMSInstance {
+
+ /**
+ * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+ */
+ def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+ val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+ CMSInstance[K](countsTable, 0, params)
+ }
+
+ /**
+ * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+ * hash function.
+ */
+ // TODO: implement a dense matrix type, and use it here
+ case class CountsTable[K](counts: Vector[Vector[Long]]) {
+ require(depth > 0, "Table must have at least 1 row.")
+ require(width > 0, "Table must have at least 1 column.")
+
+ def depth: Int = counts.size
+
+ def width: Int = counts(0).size
+
+ def getCount(pos: (Int, Int)): Long = {
+ val (row, col) = pos
+ require(row < depth && col < width, "Position must be within the bounds of this table.")
+ counts(row)(col)
+ }
+
+ /**
+ * Updates the count of a single cell in the table.
+ */
+ def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+ val (row, col) = pos
+ val currCount = getCount(pos)
+ val newCounts =
+ counts.updated(row, counts(row).updated(col, currCount + count))
+ CountsTable[K](newCounts)
+ }
+
+ /**
+ * Adds another counts table to this one, through element-wise addition.
+ */
+ def ++(other: CountsTable[K]): CountsTable[K] = {
+ require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+ val xss = this.counts.iterator
+ val yss = other.counts.iterator
+ val rows = Vector.newBuilder[Vector[Long]]
+ while (xss.hasNext) {
+ val xs = xss.next().iterator
+ val ys = yss.next().iterator
+ val row = Vector.newBuilder[Long]
+ while (xs.hasNext) row += (xs.next() + ys.next())
+ rows += row.result()
+ }
+ CountsTable[K](rows.result())
+ }
+ }
+
+ object CountsTable {
+
+ /**
+ * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+ */
+ def apply[K](depth: Int, width: Int): CountsTable[K] =
+ CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+ }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`.
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+ extends java.io.Serializable
+ with CMSCounting[K, TopCMS]
+ with CMSHeavyHitters[K] {
+
+ override val eps: Double = cms.eps
+
+ override val delta: Double = cms.delta
+
+ override val totalCount: Long = cms.totalCount
+
+ override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+ override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+ override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+ cms.innerProduct(other.cms)
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+ /**
+ * The pluggable logic with which heavy hitters are being tracked.
+ */
+ override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set.empty[K]
+
+ override def +(item: K, count: Long): TopCMS[K] =
+ TopCMSInstance(cms, params) + (item, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set(item)
+
+ override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => toCMSInstance + other.item
+ case other: TopCMSInstance[K] => other + item
+ }
+
+ private def toCMSInstance: TopCMSInstance[K] = {
+ val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+}
+
+object TopCMSInstance {
+
+ def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+ TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override def heavyHitters: Set[K] = hhs.items
+
+ override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCms = cms + (item, count)
+ val newHhs =
+ heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+ TopCMSInstance[K](newCms, newHhs, params)
+ } else this
+ }
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => this + other.item
+ case other: TopCMSInstance[K] =>
+ val newCms = cms ++ other.cms
+ val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+ TopCMSInstance(newCms, newHhs, params)
+ }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+ val params: TopCMSParams[K] = TopCMSParams(logic)
+
+ override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+ require(
+ left.cms.params.hashes == right.cms.params.hashes,
+ "The sketches must use the same hash functions."
+ )
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): TopCMS[K] =
+ TopCMSItem[K](item, emptyCms + item, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): TopCMS[K] =
+ data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) }
+
+ override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = {
+ val topCandidates = scala.collection.mutable.Set.empty[K]
+ val summation = new CMSSummation(emptyCms.params)
+ sketches.iterator.foreach { sketch =>
+ summation.updateInto(sketch.cms)
+ topCandidates ++= sketch.heavyHitters
+ }
+ val cms = summation.result
+ val ests =
+ topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet
+ val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+ override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+}
+
+class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] {
+
+ override def monoid: TopCMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): TopCMS[K] = monoid.create(value)
+
+ override def present(cms: TopCMS[K]): TopCMS[K] = cms
+
+}
+
+/**
+ * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters.
+ */
+abstract class HeavyHittersLogic[K] extends java.io.Serializable {
+
+ def updateHeavyHitters(
+ oldCms: CMS[K],
+ newCms: CMS[K]
+ )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = {
+ val oldItemCount = oldCms.frequency(item).estimate
+ val oldHh = HeavyHitter[K](item, oldItemCount)
+ val newItemCount = oldItemCount + count
+ val newHh = HeavyHitter[K](item, newItemCount)
+ purgeHeavyHitters(newCms)(hhs - oldHh + newHh)
+ }
+
+ def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = {
+ val candidates = (left.items ++ right.items).map { case i =>
+ HeavyHitter[K](i, cms.frequency(i).estimate)
+ }
+ val newHhs = HeavyHitters.from(candidates)
+ purgeHeavyHitters(cms)(newHhs)
+ }
+
+ def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K]
+
+}
+
+/**
+ * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)`
+ * times.
+ *
+ * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p
+ * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output.
+ *
+ * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked:
+ * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if
+ * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be
+ * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for
+ * tracking heavy hitters.
+ */
+case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] {
+
+ require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val minCount = heavyHittersPct * cms.totalCount
+ HeavyHitters[K](hitters.hhs.filter(_.count >= minCount))
+ }
+
+}
+
+/**
+ * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`.
+ *
+ * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias
+ * (with regard to heavy hitters) when merging instances. This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are merged: as a rule of thumb, the earlier a set of heavy hitters is merged, the more
+ * likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ * Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val sorted =
+ hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+ HeavyHitters[K](sorted.toSet)
+ }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+ def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+ def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+ def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+ HeavyHitters[K](hhs ++ other.hhs)
+
+ def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+ def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+ private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+ def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+ hhs.foldLeft(empty[K])(_ + _)
+
+ def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
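+ * A minimal usage sketch (illustrative parameter values):
+ *
+ * {{{
+ * val monoid = TopPctCMS.monoid[Long](0.001, 1E-10, 1, 0.01) // eps, delta, seed, heavyHittersPct
+ * val cms = monoid.create(Seq(1L, 1L, 2L))
+ * val hhs: Set[Long] = cms.heavyHitters // 1L and 2L both exceed 1% of the 3 total counts
+ * }}}
+ *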
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ * A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ * times in the stream.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+ extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSMonoid[K] =
+ new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are merged: as a rule of thumb, the earlier a set of heavy hitters is merged, the more
+ * likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ * - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ * - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ * - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ * bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
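+ * A minimal usage sketch (illustrative parameter values; items come from a single stream, which is the safe
+ * usage per the warning above):
+ *
+ * {{{
+ * val monoid = TopNCMS.monoid[Long](0.001, 1E-10, 1, 2) // eps, delta, seed, heavyHittersN
+ * val cms = monoid.create(Seq(1L, 1L, 2L, 3L))
+ * val top: Set[Long] = cms.heavyHitters // at most heavyHittersN = 2 items
+ * }}}
+ *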
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ * The maximum number of heavy hitters to track.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopNCMSAggregator[K] =
+ new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
+ */
+case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(
+ cms: CMS[(K1, K2)]
+ )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = {
+ val grouped = hitters.hhs.groupBy(hh => hh.item._1)
+ val (underLimit, overLimit) = grouped.partition {
+ _._2.size <= heavyHittersN
+ }
+ val sorted = overLimit.transform { case (_, hhs) =>
+ hhs.toSeq.sortBy(hh => hh.count)
+ }
+ val purged = sorted.transform { case (_, hhs) =>
+ hhs.takeRight(heavyHittersN)
+ }
+ HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet)
+ }
+
+}
+
+/*
+ * Monoid for Top-N values per key in an associative [[TopCMS]].
+ *
+ * A typical use case for this might be (Country, City) pairs. For a stream of such
+ * pairs, we might want to keep track of the most popular cities for each country.
+ *
+ * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this
+ * requires storing one CMS per distinct Country.
+ *
+ * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common
+ * countries may not make the cut if N is not "very large".
+ *
+ * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others
+ * out, while still only using a single CMS.
+ *
+ * In general the eviction of K1 is not supported, and all distinct K1 values must
+ * be retained. Therefore it is important to only use this Monoid when the number
+ * of distinct K1 values is known to be reasonably bounded.
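+ *
+ * A minimal usage sketch (hypothetical country/city values, assuming implicit CMSHasher[String]
+ * instances are in scope):
+ *
+ *   val monoid = ScopedTopNCMS.monoid[String, String](0.001, 1E-10, 1, 3) // eps, delta, seed, N per scope
+ *   val cms = monoid.create(Seq(("US", "NYC"), ("US", "LA"), ("FR", "Paris")))
+ *   val hhs: Set[(String, String)] = cms.heavyHitters // top cities, tracked per country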
+ */
+class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN))
+
+object ScopedTopNCMS {
+
+ def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] {
+ private val k1Hasher = implicitly[CMSHasher[K1]]
+ private val k2Hasher = implicitly[CMSHasher[K2]]
+
+ override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = {
+ val (k1, k2) = x
+ val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b)
+ (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width
+ }
+ }
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN)
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable {
+
+ /**
+ * Returns `a * x + b (mod p) (mod width)`.
+ */
+ def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x)
+
+}
+
+/**
+ * This formerly held the instances that moved to object CMSHasher
+ *
+ * These instances are slow, but here for compatibility with old serialized data. For new code, avoid these
+ * and instead use the implicits found in the CMSHasher companion object.
+ */
+object CMSHasherImplicits {
+
+ implicit object CMSHasherBigInt extends CMSHasher[BigInt] {
+ override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int =
+ CMSHasher.hashBytes(a, b, width)(x.toByteArray)
+ }
+
+ implicit object CMSHasherString extends CMSHasher[String] {
+ override def hash(a: Int, b: Int, width: Int)(x: String): Int =
+ CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8"))
+ }
+
+ def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort
+}
diff --git a/algebird-core/src/main/scala-2.12/DecayedVector.scala b/algebird-core/src/main/scala-2.12/DecayedVector.scala
new file mode 100644
index 000000000..18e816fe4
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/DecayedVector.scala
@@ -0,0 +1,75 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+/**
+ * Represents a container class together with time. Its monoid consists of exponentially scaling the older
+ * value and summing with the newer one.
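+ *
+ * A minimal construction sketch (illustrative values; summing decayed vectors additionally requires
+ * implicit `VectorSpace` and `Metric` instances in scope):
+ *
+ * {{{
+ * // the map {"x" -> 1.0} observed at time 10.0, with a half-life of 5.0
+ * val dv = DecayedVector.forMapWithHalflife(Map("x" -> 1.0), time = 10.0, halfLife = 5.0)
+ * }}}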
+ */
+object DecayedVector extends CompatDecayedVector {
+ def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] =
+ DecayedVector(vector, time * scala.math.log(2.0) / halfLife)
+
+ def monoidWithEpsilon[C[_]](
+ eps: Double
+ )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] =
+ new Monoid[DecayedVector[C]] {
+ override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity)
+ override def plus(left: DecayedVector[C], right: DecayedVector[C]) =
+ if (left.scaledTime <= right.scaledTime) {
+ scaledPlus(right, left, eps)
+ } else {
+ scaledPlus(left, right, eps)
+ }
+ }
+
+ def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] =
+ DecayedVector[Map[K, _]](m, scaledTime)
+ def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] =
+ forMap(m, time * scala.math.log(2.0) / halfLife)
+
+ def mapMonoidWithEpsilon[K](
+ eps: Double
+ )(implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ monoidWithEpsilon[Map[K, _]](eps)
+
+ implicit def mapMonoid[K](implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ mapMonoidWithEpsilon(-1.0)
+
+ def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit
+ vs: VectorSpace[Double, C],
+ metric: Metric[C[Double]]
+ ): DecayedVector[C] = {
+ implicit val mon: Monoid[C[Double]] = vs.group
+ val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime)
+ val newVector =
+ Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector))
+ if (eps < 0.0 || Metric.norm(newVector) > eps) {
+ DecayedVector(newVector, newVal.scaledTime)
+ } else {
+ DecayedVector(mon.zero, Double.NegativeInfinity)
+ }
+ }
+}
+
+case class DecayedVector[C[_]](vector: C[Double], scaledTime: Double)
diff --git a/algebird-core/src/main/scala-2.12/DecayingCMS.scala b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend depth of at least 5, and width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
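+ *
+ * A minimal usage sketch (illustrative parameter values, assuming an implicit `CMSHasher[Long]` is in
+ * scope):
+ *
+ * {{{
+ * import scala.concurrent.duration._
+ *
+ * val module = DecayingCMS[Long](seed = 42L, halfLife = 10.minutes, depth = 5, width = 300)
+ * val cms = module.empty.add(System.currentTimeMillis(), 1L, 1.0)
+ * val count = cms.get(1L) // a DoubleAt: the decayed count together with its timestamp
+ * }}}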
+ */
+final class DecayingCMS[K](
+ seed: Long,
+ val halfLife: Duration,
+ val depth: Int, // number of hashing functions
+ val width: Int, // number of table cells per hashing function
+ hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+ override def toString: String =
+ s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+ @inline private def getNextLogScale(
+ logScale: Double,
+ oldTimeInHL: Double,
+ nowInHL: Double
+ ): Double =
+ if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+ @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+ val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+ Math.exp(-logScale1)
+ }
+
+ val empty: CMS =
+ new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+ /**
+ * Represents a decaying scalar value at a particular point in time.
+ *
+ * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+ * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+ * equivalent if they are two points on the same curve.
+ *
+ * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+ * values do not produce the same (approximate) Double values from these methods, they represent different
+ * curves.
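+   *
+   * For example (illustrative): a value of 4.0 decays to 2.0 after one half-life, so `DoubleAt(4.0, t)`
+   * and `DoubleAt(2.0, t + halfLife.toMillis)` lie on the same curve and compare as equal.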
+ */
+ class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+ lhs =>
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map(f: Double => Double): DoubleAt =
+ new DoubleAt(f(value), timeInHL)
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt =
+ if (lhs.timeInHL < rhs.timeInHL) {
+ val x = lhs.scaledAt(rhs.timeInHL)
+ new DoubleAt(f(x, rhs.value), rhs.timeInHL)
+ } else if (lhs.timeInHL == rhs.timeInHL) {
+ new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL)
+ } else {
+ val y = rhs.scaledAt(lhs.timeInHL)
+ new DoubleAt(f(lhs.value, y), lhs.timeInHL)
+ }
+
+ def unary_- : DoubleAt = new DoubleAt(-value, timeInHL)
+ def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL)
+ def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL)
+
+ def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _)
+ def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _)
+ def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min)
+ def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max)
+
+ def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value
+
+ /**
+ * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent
+ * the same value at different points of decay.
+ */
+ def compare(rhs: DoubleAt): Int = {
+ val vc = cmp(lhs.value, rhs.value)
+ val tc = cmp(lhs.timeInHL, rhs.timeInHL)
+ if (vc == tc) vc
+ else if (tc == 0) vc
+ else if (vc == 0) tc
+ else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value)
+ else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL))
+ }
+
+ /**
+ * Time when this value will reach the smallest double value bigger than zero, unless we are already at
+ * zero in which case we return the current time
+ */
+ def timeToZero: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) timeInHL
+ else timeToUnit + DoubleAt.TimeFromUnitToZero
+
+ /**
+ * This is the scaled time when the current value will reach 1 (or -1 for negative values)
+ *
+ * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where
+ * its value would be 1, the unit value).
+ */
+ def timeToUnit: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) Double.NegativeInfinity
+ else {
+ // solve for result:
+ //
+ // 1 = value * module.getScale(0.0, timeInHL, result)
+ // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+ // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+ // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+ //
+ // log(value) = (result - timeInHL) * log2
+ // log(value) / log2 = result - timeInHL
+ // log(value) / log2 + timeInHL = result
+ Math.log(Math.abs(value)) / log2 + timeInHL
+ }
+
+ override def equals(that: Any): Boolean =
+ that match {
+ case d: DoubleAt => compare(d) == 0
+ case _ => false
+ }
+
+ override def hashCode: Int =
+ timeToUnit.##
+
+ override def toString: String =
+ s"DoubleAt($value, $timeInHL)"
+
+ def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+ def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+ def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+ def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+ def time: Long =
+ toTimestamp(timeInHL)
+
+ private def scaledAt(t: Double): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, t)
+
+ def at(time: Long): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+ }
+
+ object DoubleAt {
+ def apply(x: Double, t: Long): DoubleAt =
+ new DoubleAt(x, fromTimestamp(t))
+
+ val zero: DoubleAt =
+ new DoubleAt(0.0, Double.NegativeInfinity)
+
+ private val TimeFromUnitToZero: Double =
+ -Math.log(Double.MinPositiveValue) / log2
+ }
+
+ val totalCells: Int = depth * width
+
+ val halfLifeSecs: Double =
+ halfLife.toMillis.toDouble / 1000.0
+
+ // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+ private[this] val maxLogScale: Double = 20.0
+
+ /**
+   * Allocate an empty array of rows.
+ *
+ * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+ * often building up cells mutably.
+ */
+ private def allocCells(): Array[Vector[Double]] =
+ new Array[Vector[Double]](depth)
+
+ def toTimestamp(t: Double): Long =
+ (t * halfLifeSecs * 1000.0).toLong
+
+ def fromTimestamp(t: Long): Double =
+ (t.toDouble / 1000.0) / halfLifeSecs
+
+ val hashFns: Array[K => Int] = {
+ val rng = new Random(seed)
+ def genPos(): Int =
+ rng.nextInt() match {
+ case 0 => genPos()
+ case n => n & 0x7fffffff
+ }
+
+ (0 until depth).map { _ =>
+ val n = genPos()
+ (k: K) => hasher.hash(n, 0, width)(k)
+ }.toArray
+ }
+
+ private final val log2 = Math.log(2.0)
+
+ /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   *   delta = (t1 - t0) / halflife
+   *   y1 = y0 * 2^(-delta) + n
+ *
+ * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+ * zero value should continue to have a zero value when n=0.
+ *
+ * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+ * the following formula:
+ *
+ * (1) zN = yN * scaleN
+ *
+ * Our constraint is expressed as:
+ *
+ * (2) If n=0, z1 = z0
+ *
+ * In that case:
+ *
+   *   (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   *   (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   *   (5) 2^(-delta) * scale1 = scale0
+   *   (6) scale1 = scale0 * 2^(delta)
+ *
+ * Also, to express z1 in terms of z0, we say:
+ *
+   *   (7) z1 = y1 * scale1
+   *   (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   *   (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   *   (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   *   (11) z1 / scale1 = z0 / scale1 + n
+   *   (12) z1 = z0 + n * scale1
+ *
+ * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+ * in terms of z0 and scale1.
+ *
+ * If we convert scale to logscale, we have:
+ *
+   *   (13) logscale1 = logscale0 + delta * log(2)
+   *   (14) z1 = z0 + n * exp(logscale1)
+ *
+ * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+ * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+ * corresponding y) and set the logscale to 0.
+ *
+   *   (15) y1 = z1 / scale1
+   *   (16) y1 = z1 / exp(logscale1)
+   *   (17) y1 = z1 * exp(-logscale1)
+ */
+ final class CMS(
+ val cells: Array[Vector[Double]],
+ val logScale: Double,
+ val timeInHL: Double
+ ) extends Serializable {
+
+ @inline private def scale: Double =
+ Math.exp(-logScale)
+
+ override def toString: String = {
+ val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+ s"CMS($s, $logScale, $timeInHL)"
+ }
+
+ override def hashCode: Int =
+ deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+ logScale.## * 17 +
+ timeInHL.## * 37 +
+ 19
+
+ // unfortunately we can't check the path-dependent type of this
+ // CMS, which we signal by using a type projection here.
+ override def equals(any: Any): Boolean =
+ any match {
+ case that: DecayingCMS[?]#CMS =>
+ this.logScale == that.logScale &&
+ this.timeInHL == that.timeInHL &&
+ this.cells.length == that.cells.length && {
+ var i = 0
+ while (i < depth) {
+ if (this.cells(i) != that.cells(i)) return false
+ i += 1
+ }
+ true
+ }
+ case _ =>
+ false
+ }
+
+ def lastUpdateTime: Long =
+ toTimestamp(timeInHL)
+
+ /**
+ * Provide lower and upper bounds on values returned for any possible key.
+ *
+ * The first value is a lower bound: even keys that have never been counted will return this value or
+ * greater. This will be zero unless the CMS is saturated.
+ *
+ * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+ * larger than this value (though it might be reported as being smaller).
+ *
+ * Together these values indicate how saturated and skewed the CMS might be.
+ */
+ def range: (DoubleAt, DoubleAt) = {
+ var minMinimum = Double.PositiveInfinity
+ var minMaximum = Double.PositiveInfinity
+ var i = 0
+ while (i < cells.length) {
+ val it = cells(i).iterator
+ var localMax = it.next() // we know it doesn't start empty
+ if (localMax < minMinimum) minMinimum = localMax
+ while (it.hasNext) {
+ val n = it.next()
+ if (n > localMax) localMax = n
+ else if (n < minMinimum) minMinimum = n
+ }
+ if (localMax < minMaximum) minMaximum = localMax
+ i += 1
+ }
+
+ val s = scale
+ def sc(x: Double): DoubleAt =
+ new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+ (sc(minMinimum), sc(minMaximum))
+ }
+
+ /**
+ * Returns the square-root of the inner product of two decaying CMSs.
+ *
+ * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+ * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+ * at() on a CMS):
+ *
+ * x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+ *
+ * This is why we don't support innerProduct, only innerProductRoot.
+ */
+ def innerProductRoot(that: CMS): DoubleAt = {
+ var i = 0
+ var res = Double.PositiveInfinity
+ val t = Math.max(this.timeInHL, that.timeInHL)
+ val scale = this.getScale(t) * that.getScale(t)
+ while (i < depth) {
+ var sum = 0.0
+ val it0 = this.cells(i).iterator
+ val it1 = that.cells(i).iterator
+ while (it0.hasNext) {
+ val x = it0.next() * it1.next()
+ if (x != 0.0) sum += x
+ }
+ if (sum < res) res = sum
+ i += 1
+ }
+ val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+ new DoubleAt(x, t)
+ }
+
+ def l2Norm: DoubleAt =
+ innerProductRoot(this)
+
+ def scale(x: Double): CMS =
+ if (java.lang.Double.isNaN(x)) {
+ throw new IllegalArgumentException(s"invalid scale: $x")
+ } else if (x < 0.0) {
+ throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+ } else if (x == 0.0) {
+ module.empty
+ } else {
+ val s = logScale + Math.log(x)
+ val c = new CMS(cells, s, timeInHL)
+ if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+ }
+
+ /**
+ * Get the total count of all items in the CMS.
+ *
+ * The total is the same as the l1Norm, since we don't allow negative values.
+ *
+ * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+ * exact (except for floating-point error).
+ */
+ def total: DoubleAt = {
+ val n = cells(0).sum
+ val x = if (n == 0.0) 0.0 else scale * n
+ new DoubleAt(x, timeInHL)
+ }
+
+ def get(k: K): DoubleAt = {
+ var minValue = Double.PositiveInfinity
+ var didx = 0
+ while (didx < depth) {
+ val i = hashFns(didx)(k)
+ val inner = cells(didx)
+ val value = inner(i)
+ if (value < minValue) minValue = value
+ didx += 1
+ }
+ val x = if (minValue == 0.0) 0.0 else scale * minValue
+ new DoubleAt(x, timeInHL)
+ }
+
+ def getScale(t: Double): Double =
+ module.getScale(logScale, timeInHL, t)
+
+ private final def nextLogScale(t: Double): Double =
+ module.getNextLogScale(logScale, timeInHL, t)
+
+ def +(other: CMS): CMS = {
+ val x = this
+ val y = other
+ val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+ val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+ val xscale = x.getScale(timeInHL)
+ val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double =
+ if (x == 0.0) 0.0 else x * y
+
+ var i = 0
+ while (i < depth) {
+ val left = x.cells(i)
+ val right = y.cells(i)
+ var j = 0
+ val bldr = rowBuilder()
+ while (j < width) {
+ bldr += prod(left(j), xscale) + prod(right(j), yscale)
+ j += 1
+ }
+ cms.cells(i) = bldr.result()
+ i += 1
+ }
+ cms
+ }
+
+ def add(t: Long, k: K, n: Double): CMS =
+ scaledAdd(fromTimestamp(t), k, n)
+
+ // TODO: we could allocate a mutable scratch pad, write all the
+ // values into it, and then build a CMS out of it. if items is
+ // very small, this would be less efficient than what we're doing
+    // now. probably the "ideal" solution would be to determine how many
+ // items there are. if we have fewer than ~width items, this
+ // approach is fine. for more, a scratch pad would be better
+ // (assuming we wrote that code).
+ //
+ // alternately, you could map items into (zero + item) and then
+ // use the monoid's sum to boil it down.
+ //
+ // we only use this in testing currently so the current code is
+ // fine until we rely on it in production. any change here should
+ // probably include benchmarks justifying the design.
+ def bulkAdd(items: Iterable[(Long, K, Double)]): CMS =
+ items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) }
+
+ private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS =
+ if (n < 0.0) {
+ val t = toTimestamp(ts1)
+ throw new IllegalArgumentException(
+ s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t"
+ )
+ } else if (n == 0.0) {
+ this
+ } else {
+ val logScale1 = nextLogScale(ts1)
+ if (logScale1 > maxLogScale) {
+ rescaleTo(ts1).scaledAdd(ts1, k, n)
+ } else {
+ val increment = n * Math.exp(logScale1)
+ val cells1 = allocCells()
+ var didx = 0
+ while (didx < depth) {
+ val cell = cells(didx)
+ val w = hashFns(didx)(k)
+ cells1(didx) = cell.updated(w, cell(w) + increment)
+ didx += 1
+ }
+ new CMS(cells1, logScale1, ts1)
+ }
+ }
+
+ // Set the scale back to 0.0
+ // input time is in half-lives
+ private[algebird] def rescaleTo(ts: Double): CMS = {
+ val logScale1 = nextLogScale(ts)
+ val expL = Math.exp(-logScale1)
+ if (expL == 0.0) {
+ new CMS(monoid.zero.cells, 0.0, ts)
+ } else {
+ val cms = new CMS(allocCells(), 0.0, ts)
+ var i = 0
+ while (i < depth) {
+ val ci = cells(i)
+ cms.cells(i) = ci.map(_ * expL)
+ i += 1
+ }
+ cms
+ }
+ }
+ }
+
+ private def rowBuilder() = {
+ val bldr = Vector.newBuilder[Double]
+ bldr.sizeHint(width)
+ bldr
+ }
+
+ object CMS {
+
+ implicit val monoidForCMS: Monoid[CMS] =
+ new Monoid[CMS] {
+
+ def zero: CMS = module.empty
+
+ def plus(x: CMS, y: CMS): CMS =
+ x + y
+
+ /**
+ * Turn a flat array into an array of vectors.
+ */
+ private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = {
+ val cells = new Array[Vector[Double]](depth)
+ var i = 0
+ while (i < depth) {
+ var j = i * width
+ val limit = j + width
+ val bldr = rowBuilder()
+ while (j < limit) {
+ bldr += scratch(j)
+ j += 1
+ }
+ cells(i) = bldr.result()
+ i += 1
+ }
+ cells
+ }
+
+ /**
+ * This method sums the first `num` items in `arr`.
+ */
+ private def innerSum(arr: Array[CMS], num: Int): CMS =
+ if (num == 0) zero
+ else if (num == 1) arr(0)
+ else if (num == 2) plus(arr(0), arr(1))
+ else {
+ // start with zero
+ val scratch: Array[Double] = new Array(totalCells)
+
+ val latestTimeInHL: Double =
+ arr.iterator.take(num).map(cms => cms.timeInHL).max
+
+ var i = 0
+ while (i < num) {
+ val cms = arr(i)
+ val scale = cms.getScale(latestTimeInHL)
+ var j = 0
+ while (j < depth) {
+ val row = cms.cells(j)
+ val stride = j * width
+ var k = 0
+ while (k < width) {
+ val n = row(k)
+ if (n > 0.0) {
+ scratch(stride + k) += scale * n
+ }
+ k += 1
+ }
+ j += 1
+ }
+ i += 1
+ }
+
+ val cells = scratchToCells(scratch)
+
+ new CMS(cells, 0.0, latestTimeInHL)
+ }
+
+ override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = {
+
+ val it: Iterator[CMS] = xs.toIterator
+ val ChunkSize = 1000
+
+ // the idea here is that we read up to 1000 CMS values into
+ // a fixed array, crunch them down to a single CMS, store it
+ // in the first array index, read up to 999 more CMS values
+ // in, crunch them down, and so on.
+ var i = 0
+ val arr = new Array[CMS](ChunkSize)
+ while (it.hasNext) {
+ while (it.hasNext && i < ChunkSize) {
+ arr(i) = it.next()
+ i += 1
+ }
+ if (i > 1) {
+ arr(0) = innerSum(arr, i)
+ }
+ i = 1
+ }
+ if (i == 0) None else Some(arr(0))
+ }
+ }
+ }
+
+ val monoid: Monoid[CMS] = CMS.monoidForCMS
+}
+
+object DecayingCMS {
+
+ /**
+ * Construct a DecayingCMS module.
+ *
+ * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will
+ * always produce the same hash family.
+ *
+ * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by
+ * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to
+ * zero.
+ *
+ * The size of the CMS in bytes is O(depth * width).
+ *
+ * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use
+ * width=100, for 0.1% error, use width=1000, etc.
+ *
+ * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha *
+ * depth) so, a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this
+ * as small as possible.
+ */
+ def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit
+ hasher: CMSHasher[K]
+ ): DecayingCMS[K] =
+ new DecayingCMS(seed, halfLife, depth, width, hasher)
+}
diff --git a/algebird-core/src/main/scala-2.12/Fold.scala b/algebird-core/src/main/scala-2.12/Fold.scala
new file mode 100644
index 000000000..0b89f2d62
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Fold.scala
@@ -0,0 +1,352 @@
+/*
+Copyright 2014 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.io.Serializable
+import scala.collection.compat._
+
+/**
+ * Folds are first-class representations of "Traversable.foldLeft." They have the nice property that they can
+ * be fused to work in parallel over an input sequence.
+ *
+ * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when
+ * done. We use existential types to hide internal details and to allow for internal and external (X and O)
+ * types to differ for "map" and "join."
+ *
+ * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a
+ * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the
+ * fold.
+ *
+ * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like
+ * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also
+ * expose some internal state so library authors can fold over their own types.
+ *
+ * See the companion object for constructors.
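+ *
+ * A minimal usage sketch (illustrative): two folds fused so the input is traversed only once.
+ *
+ * {{{
+ * val sum = Fold.foldLeft[Int, Long](0L)(_ + _)
+ * val count = Fold.foldLeft[Int, Int](0)((n, _) => n + 1)
+ * val both: Fold[Int, (Long, Int)] = sum.join(count)
+ * both.overTraversable(List(1, 2, 3)) // (6L, 3)
+ * }}}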
+ */
+sealed trait Fold[-I, +O] extends Serializable {
+
+ /**
+ * Users can ignore this type.
+ *
+ * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good
+ * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it
+ * provides.
+ */
+ type X
+
+ /**
+ * Users can ignore this method. It is exposed so library authors can run folds over their own sequence
+ * types.
+ *
+ * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the
+ * same Fold many times over different data structures, but we must build a new FoldState every time.
+ *
+ * See FoldState for information on how to use this for your own sequence types.
+ */
+ def build(): FoldState[X, I, O]
+
+ /**
+ * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or
+ * "Function1.compose."
+ */
+ def map[P](f: O => P): Fold[I, P] = {
+ val self = this
+ new Fold[I, P] {
+ type X = self.X
+ override def build(): FoldState[X, I, P] =
+ self.build().map(f)
+ }
+ }
+
+ /**
+ * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time
+ * and combines at the end.
+ */
+ def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = {
+ val self = this
+ new Fold[I2, Q] {
+ type X = (self.X, other.X)
+ override def build(): FoldState[X, I2, Q] = {
+ val first = self.build()
+ val second = other.build()
+ new FoldState(
+ { case ((x, y), i) => (first.add(x, i), second.add(y, i)) },
+ (first.start, second.start),
+ { case (x, y) => f(first.end(x), second.end(y)) }
+ )
+ }
+ }
+ }
+
+ /**
+ * Convenient shorthand for joining Folds without combining at the end.
+ */
+ def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] =
+ joinWith(other) { case (o, p) => (o, p) }
+
+ /**
+ * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.")
+ * This is analogous to "Function1.andThen."
+ */
+ def contramap[H](f: H => I): Fold[H, O] = {
+ val self = this
+ new Fold[H, O] {
+ type X = self.X
+ override def build(): FoldState[X, H, O] =
+ self.build().contramap(f)
+ }
+ }
+
+ /**
+ * Trivially runs a Fold over an empty sequence.
+ */
+ def overEmpty: O = {
+ // build is a "def" so we construct the state once and use the pieces to run the fold
+ val state = build()
+ state.end(state.start)
+ }
+
+ /**
+ * Trivially runs a Fold over a single element sequence.
+ */
+ def overSingleton(i: I): O = {
+ val state = build()
+ state.end(state.add(state.start, i))
+ }
+
+ /**
+ * Runs a Fold over a Traversable.
+ */
+ def overTraversable(is: TraversableOnce[I]): O = {
+ val state = build()
+ state.end(is.iterator.foldLeft(state.start)(state.add))
+ }
+}
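As a quick illustration of the fusion this trait enables, two Folds can be joined so a single traversal drives both accumulators. A minimal sketch (the `mean` value and sample data are illustrative, not part of this change):

```scala
import com.twitter.algebird.Fold

// Fuse sum and size: both accumulate during one pass, then combine at the end.
val mean: Fold[Double, Double] =
  Fold.sum[Double].joinWith(Fold.size) { (total, n) =>
    if (n == 0L) 0.0 else total / n
  }

// One traversal of the input drives both accumulators.
assert(mean.overTraversable(List(1.0, 2.0, 3.0, 4.0)) == 2.5)
```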
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+ extends Serializable {
+
+ /**
+ * Transforms the output type of the FoldState (see Fold.map).
+ */
+ def map[P](f: O => P): FoldState[X, I, P] =
+ new FoldState(add, start, end.andThen(f))
+
+ /**
+ * Transforms the input type of the FoldState (see Fold.contramap).
+ */
+ def contramap[H](f: H => I): FoldState[X, H, O] =
+ new FoldState((x, h) => add(x, f(h)), start, end)
+}
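For library authors, here is a sketch of driving a Fold over a custom container with build()/FoldState (the `runOverArray` helper is hypothetical, invented for this example):

```scala
import com.twitter.algebird.Fold

// build() yields a fresh FoldState; we thread start/add/end through
// our own loop, much as overTraversable does internally.
def runOverArray[I, O](fold: Fold[I, O], items: Array[I]): O = {
  val state = fold.build()
  var acc = state.start
  var i = 0
  while (i < items.length) {
    acc = state.add(acc, items(i))
    i += 1
  }
  state.end(acc)
}

// e.g. runOverArray(Fold.size, Array("a", "b", "c")) == 3L
```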
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+ /**
+ * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+ */
+ implicit def applicative[I]: Applicative[Fold[I, _]] =
+ new FoldApplicative[I]
+
+ /**
+ * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+ */
+ def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+ fold[O, I, O](add, o, o => o)
+
+ /**
+ * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+ * immutable and serializable.
+ */
+ def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start, end)
+ }
+
+ /**
+ * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+ */
+ def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start(()), end)
+ }
+
+ /**
+ * Fuse a sequence of Folds into one that outputs the result of each.
+ */
+ def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+ new Fold[I, Seq[O]] {
+ type X = Seq[Any]
+ override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+ val bs: Seq[FoldState[Any, I, O]] =
+ ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+ val adds =
+ bs.map(_.add)
+ val ends =
+ bs.map(_.end)
+ val starts: Seq[Any] =
+ bs.map(_.start)
+ val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+ val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+ new FoldState(add, starts, end)
+ }
+ }
+
+ /**
+ * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq]": fewer type arguments,
+ * better type inference.
+ */
+ def seq[I]: Fold[I, Seq[I]] =
+ container[I, Seq]
+
+ /**
+ * A Fold that does no work and returns a constant. Analogous to Function1's const:
+ * def const[A, B](b: B): (A => B) = { _ => b }
+ */
+ def const[O](value: O): Fold[Any, O] =
+ Fold.foldLeft(value) { case (u, _) => u }
+
+ /**
+ * A Fold that runs the given side effect for every element.
+ */
+ def foreach[I](e: I => Unit): Fold[I, Unit] =
+ Fold.foldLeft(()) { case (_, i) => e(i) }
+
+ /**
+ * A Fold that returns the first value in a sequence.
+ */
+ def first[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the last value in a sequence.
+ */
+ def last[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+ /**
+ * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+ */
+ def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns a min value in a sequence. (Biased to earlier equal values.)
+ */
+ def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+ */
+ def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+ /**
+ * For a semigroup, if we get more than 0 items, use plus
+ */
+ def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+ Fold.foldLeft(None: Option[T]) {
+ case (None, i) => Some(i)
+ case (Some(l), r) => Some(sg.plus(l, r))
+ }
+
+ /**
+ * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+ */
+ def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+ /**
+ * A Fold that returns the length of a sequence.
+ */
+ def size: Fold[Any, Long] =
+ Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+ /**
+ * A Fold that returns "true" if all elements of the sequence statisfy the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(true)((b, i) => b && pred(i))
+
+ /**
+ * A Fold that returns "true" if any element of the sequence statisfies the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def exists[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(false)((b, i) => b || pred(i))
+
+ /**
+ * A Fold that counts the number of elements satisfying the predicate.
+ */
+ def count[I](pred: I => Boolean): Fold[I, Long] =
+ foldLeft(0L) {
+ case (c, i) if pred(i) => c + 1L
+ case (c, _) => c
+ }
+}
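To make the constructors above concrete, a small illustrative sketch fusing several of them with Fold.sequence (the data is made up):

```scala
import com.twitter.algebird.Fold

// min, max, and first share one traversal; outputs line up with the
// order of the folds passed to sequence.
val stats: Fold[Int, Seq[Option[Int]]] =
  Fold.sequence(Seq(Fold.min[Int], Fold.max[Int], Fold.first[Int]))

assert(stats.overTraversable(List(3, 1, 4, 1, 5)) == Seq(Some(1), Some(5), Some(3)))
```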
+
+/**
+ * Folds are Applicatives!
+ */
+class FoldApplicative[I] extends Applicative[Fold[I, _]] {
+ override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] =
+ mt.map(fn)
+ override def apply[T](v: T): Fold[I, T] =
+ Fold.const(v)
+ override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] =
+ mt.join(mu)
+ override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] =
+ Fold.sequence(ms)
+ override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] =
+ mt.joinWith(mu)(fn)
+}
diff --git a/algebird-core/src/main/scala-2.12/Interval.scala b/algebird-core/src/main/scala-2.12/Interval.scala
new file mode 100644
index 000000000..6a1645d16
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Interval.scala
@@ -0,0 +1,380 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird
+
+// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...)
+
+/**
+ * Represents a single interval on a T with an Ordering
+ */
+sealed trait Interval[T] extends java.io.Serializable {
+ def contains(t: T)(implicit ord: Ordering[T]): Boolean
+
+ def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T]
+ final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t)
+ final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that)
+
+ /**
+ * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the
+ * result is meaningless. TODO: It might be good to have types for these properties in algebird.
+ */
+ def mapNonDecreasing[U](fn: T => U): Interval[U]
+}
+
+case class Universe[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ that
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe()
+}
+
+case class Empty[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ this
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty()
+}
+
+object Interval extends java.io.Serializable {
+
+ /**
+ * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type
+ * information of the returned interval. The compiler doesn't know anything about ordering, so without
+ * [[MaybeEmpty]] the only valid return type is Interval[T].
+ */
+ sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] {
+ def isEmpty: Boolean
+ }
+ object MaybeEmpty {
+
+ /**
+ * Represents an empty interval.
+ */
+ case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = true
+ }
+
+ /**
+ * Represents a non-empty interval.
+ */
+ case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = false
+ }
+ }
+
+ type GenIntersection[T] = Intersection[Lower, Upper, T]
+ type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T]
+ type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T]
+ type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T]
+ type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T]
+
+ implicit def monoid[T: Ordering]: Monoid[Interval[T]] =
+ Monoid.from[Interval[T]](Universe[T]())(_ && _)
+
+ // Automatically convert from a MaybeEmpty instance
+ implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] =
+ me match {
+ case MaybeEmpty.SoEmpty() => Empty()
+ case MaybeEmpty.NotSoEmpty(i) => i
+ }
+
+ def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowExUp]()
+
+ def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowInUp]()
+
+ def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] =
+ if (Ordering[T].lteq(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowInUp]()
+
+ def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowExUp]()
+
+ /**
+ * This is here for binary compatibility reasons. These methods should be moved to Interval, which should
+ * also be an abstract class for better binary compatibility at the next incompatible change.
+ */
+ implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+ def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+ case Empty() => true
+ case Universe() => false
+ case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.ordering.lt(l, u)
+ case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.ordering.lteq(l, u)
+ case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lt(_, u))
+ case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lteq(_, u))
+ case InclusiveLower(_) => false // we at least have l
+ case InclusiveUpper(_) => false // we at least have u
+ case ExclusiveLower(l) =>
+ succ.next(l).isEmpty
+ case ExclusiveUpper(u) =>
+ pred.prev(u).isEmpty
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s less than t such that intr.contains(s)
+ *
+ * if this returns None, it may be Empty, Upper or Universe
+ */
+ def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Upper[?] => None
+ case i @ Intersection(_, _) => i.least
+ case l: Lower[?] => l.least
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+ * intr.contains(s)
+ *
+ * if this returns None, it may be Empty, Lower, or Universe
+ */
+ def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+ intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Lower[?] => None
+ case i @ Intersection(_, _) => i.greatest
+ case u: Upper[?] => u.greatest
+ }
+ }
+}
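A short sketch of how the factory methods, the MaybeEmpty conversion, and && compose (the bounds are illustrative):

```scala
import com.twitter.algebird._

// [0, 10), implicitly widened from MaybeEmpty to Interval.
val a: Interval[Int] = Interval.leftClosedRightOpen(0, 10)
// [5, 20), built directly from a lower and an upper bound.
val b: Interval[Int] = InclusiveLower(5) && ExclusiveUpper(20)
// Intersection is again an Interval: here [5, 10).
val c: Interval[Int] = a && b
assert(c.contains(5) && c.contains(9) && !c.contains(10))
```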
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+ /**
+ * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+ * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+ * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+ * notion, which we don't have a typeclass for.
+ */
+ def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+ /**
+ * The smallest value that is contained here. This is an Option, because of cases like
+ * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty.
+ */
+ def least(implicit s: Successible[T]): Option[T]
+ def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Lower[T] from lowest to highest
+ */
+ def toIterable(implicit s: Successible[T]): Iterable[T] =
+ least match {
+ case Some(l) => s.iterateNext(l)
+ case None => Iterable.empty
+ }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+ /**
+ * The largest value that is contained here. This is an Option, because of cases like
+ * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+ */
+ def greatest(implicit p: Predecessible[T]): Option[T]
+ // The smallest value that is not present
+ def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Upper[T] from highest to lowest
+ */
+ def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+ greatest match {
+ case Some(g) => p.iteratePrev(g)
+ case None => Iterable.empty
+ }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+ case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+ }
+ override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lt(lower, upper)
+ case ExclusiveUpper(upper) =>
+ ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+ }
+ override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = Some(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower))
+}
+case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper))
+}
+case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper))
+}
+
+case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ lower.contains(t) && upper.contains(t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) => (lb && lower) && upper
+ case lb @ ExclusiveLower(_) => (lb && lower) && upper
+ case ub @ InclusiveUpper(_) => lower && (ub && upper)
+ case ub @ ExclusiveUpper(_) => lower && (ub && upper)
+ case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU)
+ }
+ override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = {
+ val newLower = lower match {
+ case InclusiveLower(l) => InclusiveLower(fn(l))
+ case ExclusiveLower(l) => ExclusiveLower(fn(l))
+ }
+ val newUpper = upper match {
+ case InclusiveUpper(u) => InclusiveUpper(fn(u))
+ case ExclusiveUpper(u) => ExclusiveUpper(fn(u))
+ }
+ Intersection(newLower, newUpper)
+ }
+
+ def least(implicit s: Successible[T]): Option[T] =
+ lower.least.filter(upper.contains(_)(s.ordering))
+
+ /**
+ * Goes from lowest to highest for all items that are contained in this Intersection
+ */
+ def leastToGreatest(implicit s: Successible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = s.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+ }
+ }
+
+ def greatest(implicit p: Predecessible[T]): Option[T] =
+ upper.greatest.filter(lower.contains(_)(p.ordering))
+
+ /**
+ * Goes from highest to lowest for all items that are contained in this Intersection
+ */
+ def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = p.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+ }
+ }
+
+ /**
+ * Some intervals can actually be synonyms for empty: (0,0) for instance, contains nothing. This cannot be
+ * normalized to [a, b) form, thus we return an Option. Also, there are cases, like [Int.MinValue,
+ * Int.MaxValue], that are actually equivalent to Universe. The bottom line: if this returns None, it
+ * just means you can't express the interval in this form; it does not mean it is empty or universe, etc...
+ * (there are other cases).
+ */
+ def toLeftClosedRightOpen(implicit
+ s: Successible[T]
+ ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+ for {
+ l <- lower.least
+ g <- upper.strictUpperBound if s.ordering.lt(l, g)
+ } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+ extends Semigroup[U] {
+ override def plus(l: U, r: U): U =
+ forward(semigroup.plus(reverse(l), reverse(r)))
+ override def sumOption(iter: TraversableOnce[U]): Option[U] =
+ semigroup.sumOption(iter.map(reverse)).map(forward)
+
+ /*
+ * Note these work for the subclasses since in those cases semigroup
+ * will be the appropriate algebra.
+ */
+ override val hashCode: Int = (forward, reverse, semigroup).hashCode
+ override def equals(that: Any): Boolean =
+ that match {
+ case r: InvariantSemigroup[?, ?] =>
+ (hashCode == r.hashCode) &&
+ (forward == r.forward) &&
+ (reverse == r.reverse) &&
+ (semigroup == r.semigroup)
+ case _ => false
+ }
+}
+
+class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T])
+ extends InvariantSemigroup[T, U](forward, reverse)
+ with Monoid[U] {
+ override val zero: U = forward(monoid.zero)
+}
+
+class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T])
+ extends InvariantMonoid[T, U](forward, reverse)
+ with Group[U] {
+ override def negate(u: U): U = forward(group.negate(reverse(u)))
+ override def minus(l: U, r: U): U =
+ forward(group.minus(reverse(l), reverse(r)))
+}
+
+class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T])
+ extends InvariantGroup[T, U](forward, reverse)
+ with Ring[U] {
+ override val one: U = forward(ring.one)
+ override def times(l: U, r: U): U =
+ forward(ring.times(reverse(l), reverse(r)))
+ override def product(iter: TraversableOnce[U]): U =
+ forward(ring.product(iter.map(reverse)))
+}
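As an illustration, a hypothetical wrapper type can borrow an algebra through a pair of conversions (the `Meters` type is invented for this sketch):

```scala
import com.twitter.algebird._

case class Meters(value: Int)

// Monoid[Meters] derived from the implicit Monoid[Int] via forward/reverse.
val metersMonoid: Monoid[Meters] =
  new InvariantMonoid[Int, Meters](Meters(_), _.value)

assert(metersMonoid.plus(Meters(2), Meters(3)) == Meters(5))
assert(metersMonoid.zero == Meters(0))
```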
diff --git a/algebird-core/src/main/scala-2.12/JavaMonoids.scala b/algebird-core/src/main/scala-2.12/JavaMonoids.scala
new file mode 100644
index 000000000..26ce54f0a
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/JavaMonoids.scala
@@ -0,0 +1,147 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.lang.{
+ Boolean => JBool,
+ Double => JDouble,
+ Float => JFloat,
+ Integer => JInt,
+ Long => JLong,
+ Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+ override val zero: JInt = JInt.valueOf(0)
+ override val one: JInt = JInt.valueOf(1)
+ override def plus(x: JInt, y: JInt): JInt = x + y
+ override def negate(x: JInt): JInt = -x
+ override def minus(x: JInt, y: JInt): JInt = x - y
+ override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+ override val zero: JShort = Short.box(0)
+ override val one: JShort = Short.box(1)
+ override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+ override def negate(x: JShort): JShort = (-x).toShort
+ override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+ override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+ override val zero: JLong = JLong.valueOf(0L)
+ override val one: JLong = JLong.valueOf(1L)
+ override def plus(x: JLong, y: JLong): JLong = x + y
+ override def negate(x: JLong): JLong = -x
+ override def minus(x: JLong, y: JLong): JLong = x - y
+ override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+ override val zero: JFloat = JFloat.valueOf(0.0f)
+ override val one: JFloat = JFloat.valueOf(1.0f)
+ override def plus(x: JFloat, y: JFloat): JFloat = x + y
+ override def negate(x: JFloat): JFloat = -x
+ override def minus(x: JFloat, y: JFloat): JFloat = x - y
+ override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+ override val zero: JDouble = JDouble.valueOf(0.0)
+ override val one: JDouble = JDouble.valueOf(1.0)
+ override def plus(x: JDouble, y: JDouble): JDouble = x + y
+ override def negate(x: JDouble): JDouble = -x
+ override def minus(x: JDouble, y: JDouble): JDouble = x - y
+ override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+ override val zero: JBool = JBool.FALSE
+ override val one: JBool = JBool.TRUE
+ override def plus(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue ^ y.booleanValue)
+ override def negate(x: JBool): JBool = x
+ override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+ override def times(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since java Lists are mutable, this always makes a full copy. Prefer scala immutable Lists: if you use
+ * scala immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+ override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+ override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+ override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+ val res = new JArrayList[T](x.size + y.size)
+ res.addAll(x)
+ res.addAll(y)
+ res
+ }
+}
+
+/**
+ * Since java maps are mutable, this always makes a full copy. Prefer scala immutable maps: if you use
+ * scala immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+ override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+ val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: JMap[K, V]): Boolean =
+ !x.isEmpty && (implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] =>
+ x.values.asScala.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+ override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ val vsemi = implicitly[Semigroup[V]]
+ val result = new JHashMap[K, V](big.size + small.size)
+ result.putAll(big)
+ small.entrySet.asScala.foreach { kv =>
+ val smallK = kv.getKey
+ val smallV = kv.getValue
+ if (big.containsKey(smallK)) {
+ val bigV = big.get(smallK)
+ val newV =
+ if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV)
+ if (nonZero(newV))
+ result.put(smallK, newV)
+ else
+ result.remove(smallK)
+ } else {
+ // No need to explicitly add with zero on V, just put in the small value
+ result.put(smallK, smallV)
+ }
+ }
+ result
+ }
+}
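A quick sketch of JMapMonoid in use; assuming algebird's implicit Semigroup[Int] (which is a Monoid), keys whose values sum to zero are dropped:

```scala
import java.util.{HashMap => JHashMap}
import com.twitter.algebird._

val mon = new JMapMonoid[String, Int]

val x = new JHashMap[String, Int]()
x.put("a", 1)
x.put("b", 2)
val y = new JHashMap[String, Int]()
y.put("b", -2)
y.put("c", 3)

// "b" sums to zero and is removed; the result is {a=1, c=3}.
val merged = mon.plus(x, y)
```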
diff --git a/algebird-core/src/main/scala-2.12/MapAlgebra.scala b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
new file mode 100644
index 000000000..9ca370eaf
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
@@ -0,0 +1,320 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import com.twitter.algebird.macros.{Cuber, Roller}
+import scala.collection.mutable.{Builder, Map => MMap}
+import scala.collection.{Map => ScMap}
+import algebra.ring.Rng
+import scala.collection.compat._
+
+trait MapOperations[K, V, M <: ScMap[K, V]] {
+ def add(oldMap: M, kv: (K, V)): M
+ def remove(oldMap: M, k: K): M
+ def fromMutable(mut: MMap[K, V]): M
+}
+
+abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V])
+ extends Monoid[M]
+ with MapOperations[K, V, M] {
+
+ val nonZero: (V => Boolean) = semigroup match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: M): Boolean =
+ !x.isEmpty && (semigroup match {
+ case mon: Monoid[?] =>
+ x.valuesIterator.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+
+ override def plus(x: M, y: M): M = {
+ // Scala maps can reuse internal structure, so don't copy, just add into the bigger one:
+ // This really saves computation when adding lots of small maps into big ones (common)
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small match {
+ // Mutable maps create new copies of the underlying data on add so don't use the
+ // handleImmutable method.
+ // Cannot have a None so 'get' is safe here.
+ case _: MMap[?, ?] => sumOption(Seq(big, small)).get
+ case _ => handleImmutable(big, small, bigOnLeft)
+ }
+ }
+
+ private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) =
+ small.foldLeft(big) { (oldMap, kv) =>
+ val newV = big
+ .get(kv._1)
+ .map { bigV =>
+ if (bigOnLeft)
+ semigroup.plus(bigV, kv._2)
+ else
+ semigroup.plus(kv._2, bigV)
+ }
+ .getOrElse(kv._2)
+ if (nonZero(newV))
+ add(oldMap, kv._1 -> newV)
+ else
+ remove(oldMap, kv._1)
+ }
+ override def sumOption(items: TraversableOnce[M]): Option[M] =
+ if (items.iterator.isEmpty) None
+ else {
+ val mutable = MMap[K, V]()
+ items.iterator.foreach { m =>
+ m.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val newV =
+ if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v)
+ if (nonZero(newV))
+ mutable.update(k, newV)
+ else
+ mutable.remove(k)
+ }
+ }
+ Some(fromMutable(mutable))
+ }
+}
+
+class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] {
+ override lazy val zero: Map[K, V] = Map[K, V]()
+ override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv
+ override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): Map[K, V] =
+ new MutableBackedMap(mut)
+}
+
+class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] {
+ override lazy val zero: ScMap[K, V] = ScMap[K, V]()
+ override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv
+ override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): ScMap[K, V] =
+ new MutableBackedMap(mut)
+}
+
+/**
+ * You can think of this as a Sparse vector group
+ */
+class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] {
+ override def negate(kv: Map[K, V]): Map[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+class ScMapGroup[K, V](implicit val group: Group[V])
+ extends ScMapMonoid[K, V]()(group)
+ with Group[ScMap[K, V]] {
+ override def negate(kv: ScMap[K, V]): ScMap[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+/**
+ * You can think of this as a Sparse vector ring
+ */
+trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] {
+
+ implicit def ring: Ring[V]
+
+ override def times(x: M, y: M): M = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small.foldLeft(zero) { (oldMap, kv) =>
+ val bigV = big.getOrElse(kv._1, ring.zero)
+ val newV =
+ if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV)
+ if (ring.isNonZero(newV)) {
+ add(oldMap, kv._1 -> newV)
+ } else {
+ remove(oldMap, kv._1)
+ }
+ }
+ }
+}
+
+class MapRing[K, V](implicit override val ring: Ring[V])
+ extends MapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, Map[K, V]]
+
+class ScMapRing[K, V](implicit override val ring: Ring[V])
+ extends ScMapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, ScMap[K, V]]
+
+object MapAlgebra {
+ def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean =
+ l.forall { case (k, v) =>
+ r.get(k).exists(Equiv[V].equiv(_, v))
+ }
+
+ implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] =
+ Equiv.fromFunction { (m1, m2) =>
+ val cleanM1 = removeZeros(m1)
+ val cleanM2 = removeZeros(m2)
+ rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, cleanM1)
+ }
+
+ def mergeLookup[T, U, V: Monoid](
+ keys: TraversableOnce[T]
+ )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+ sumByKey {
+ keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+ }
+
+ // Returns a new map with zero-value entries removed
+ def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+ m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+ /**
+ * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+ * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+ * equivalent to:
+ *
+ * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+ *
+ * Otherwise, the function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+ */
+ def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+ Monoid.sum(pairs.iterator.map(Map(_)))
+
+ /**
+ * For each key, creates a list of all values. This function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2))
+ */
+ def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+ if (pairs.iterator.isEmpty) Map.empty
+ else {
+ val mutable = MMap[K, Builder[V, List[V]]]()
+ pairs.iterator.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val bldr = if (oldVOpt.isEmpty) {
+ val b = List.newBuilder[V]
+ mutable.update(k, b)
+ b
+ } else oldVOpt.get
+ bldr += v
+ }
+ mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+ }
+
+ // Consider this as edges from k -> v, produce a Map[K,Set[V]]
+ def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+ Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+ /** join the keys of two maps (similar to outer-join in a DB) */
+ def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+ Monoid
+ .plus(
+ map1.transform { case (_, v) =>
+ (List(v), List[W]())
+ },
+ map2.transform { case (_, w) =>
+ (List[V](), List(w))
+ }
+ )
+ .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
+
+ /**
+ * Reverses a graph losslessly. The None key is for v's with no sources.
+ */
+ def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+ def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+ if (i.isEmpty) Iterable(None)
+ else {
+ i.map(Some(_))
+ }
+
+ Monoid.sum {
+ for {
+ (k, sv) <- m.view.toIterable
+ v <- nonEmptyIter(sv)
+ } yield Map(v -> k.toSet)
+ }
+ }
+
+ /**
+ * Invert the common case of exactly one value for each key.
+ */
+ def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+ Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+ def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+ Monoid.sum(mring.times(left, right).values)
+
+ def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+ val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ c(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+ def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit c: Cuber[K]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+ def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+ val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ r(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+ def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit r: Roller[K]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+}
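A few of the operations above in one illustrative sketch (the sample data is made up):

```scala
import com.twitter.algebird.MapAlgebra

val pairs = List("a" -> 1, "b" -> 2, "a" -> 3)

// Sum values per key with the value Semigroup.
assert(MapAlgebra.sumByKey(pairs) == Map("a" -> 4, "b" -> 2))

// Keep every value per key, in encounter order.
assert(MapAlgebra.group(pairs) == Map("a" -> List(1, 3), "b" -> List(2)))

// Invert a one-value-per-key map into value -> set-of-keys.
assert(MapAlgebra.invert(Map(1 -> "x", 2 -> "x")) == Map("x" -> Set(1, 2)))
```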
diff --git a/algebird-core/src/main/scala-2.12/Scan.scala b/algebird-core/src/main/scala-2.12/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+ /**
+ * Most consumers of Scan don't care about the State type variable. But for those that do,
+ * we make an effort to expose it in all of our combinators.
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ */
+ type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+ implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+ def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+ }
+
+ def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+ override type State = Unit
+ override val initialState = ()
+ override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+ }
+
+ /**
+ * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+ * stream of outputs. A stream can be thought of as a hidden state that is queryable for a head
+ * element, together with another hidden state that represents the rest of the stream.
+ * @param initState
+ * The initial state of the scan; think of this as an infinite stream.
+ * @param destructor
+ * This function decomposes a stream into its head element and tail stream.
+ * @tparam S
+ * The hidden state of the stream that we are turning into a Scan.
+ * @tparam O
+ * The type of the elements of the stream that we are turning into a Scan.
+ * @return
+ * A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+ * stream using the information provided to this method.
+ */
+ def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+ destructor(stateBeforeProcessingI)
+ }
+
+ /**
+ * A Scan whose `Nth` output is the number `N` (starting from 0).
+ */
+ val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+ def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+ /**
+ * @param initStateCreator
+ * A call-by-name method that allocates new mutable state
+ * @param presentAndUpdateStateFn
+ * A function that both presents the output value, and has the side-effect of updating the mutable state
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ * @return
+ * A Scan that safely encapsulates state while it's doing its thing.
+ */
+ def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override def initialState = initStateCreator
+ override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+ }
+
+ /**
+ * The trivial scan that always returns the same value, regardless of input
+ * @param t
+ * @tparam T
+ */
+ def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+ /**
+ * @param aggregator
+ * @param initState
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+ * aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+ from(initState) { (a: A, stateBeforeProcessingI: B) =>
+ // nb: the order of the arguments to semigroup.plus here is what determines the order of the final summation;
+ // this matters because not all semigroups are commutative
+ val stateAfterProcessingA =
+ aggregator.append(stateBeforeProcessingI, a)
+ (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * @param monoidAggregator
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+ * monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+ fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
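A minimal sketch of building a scan with `from` (the `runningSum` name and data are illustrative). Note that each input emits exactly one output, so N inputs yield N outputs, unlike scanLeft's N+1:

```scala
import com.twitter.algebird.Scan

// The hidden State is the sum so far; each input emits the updated sum.
val runningSum: Scan.Aux[Int, Int, Int] =
  Scan.from(0) { (i: Int, sumSoFar: Int) =>
    val next = sumSoFar + i
    (next, next)
  }

assert(runningSum(List(1, 2, 3, 4)) == List(1, 3, 6, 10))
```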
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, where you only use one element of the
+ * pair within the `scanLeft`, then throw that element away in a `map` immediately after the scanLeft is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ * The type of elements that the computation is scanning over.
+ * @tparam O
+ * The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+ import Scan.{from, Aux}
+
+ /**
+ * The computation of any given scan involves keeping track of a hidden state.
+ */
+ type State
+
+ /**
+ * The state of the scan before any elements have been processed
+ * @return
+ */
+ def initialState: State
+
+ /**
+ * @param i
+ * An element in the stream to process
+ * @param stateBeforeProcessingI
+ * The state of the scan before processing i
+ * @return
+ * The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+ * result of updating stateBeforeProcessing with the information from i.
+ */
+ def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+ /**
+ * @param iter
+ * @return
+ * If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) =
+ * presentAndNextState(a_i, state_i)` and `state_0 = initialState`
+ */
+ def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+ override def hasNext: Boolean = iter.hasNext
+ var state: State = initialState
+ override def next(): O = {
+ val thisState = state
+ val thisA = iter.next()
+ val (thisC, nextState) = presentAndNextState(thisA, thisState)
+ state = nextState
+ thisC
+ }
+ }
+
+ /**
+ * @param inputs
+ * @param bf
+ * @tparam In
+ * The type of the input collection
+ * @tparam Out
+ * The type of the output collection
+ * @return
+ * Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form:
+ * `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+ * initialState`.
+ */
+ def apply[In <: TraversableOnce[I], Out](
+ inputs: In
+ )(implicit bf: BuildFrom[In, O, Out]): Out =
+ bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+ // combinators
+
+ /**
+ * Return a new scan that is the same as this scan, but with a different `initialState`.
+ * @param newInitialState
+ * @return
+ */
+ def replaceState(newInitialState: => State): Aux[I, State, O] =
+ from(newInitialState)(presentAndNextState(_, _))
+
+ def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+ presentAndNextState(f(i), stateBeforeProcessingI)
+ }
+
+ def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ (g(c), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+ * pollute the `State` by pairing it redundantly with `Unit`.
+ * @tparam I1
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, then this results in a Scan whose `apply` method returns `[(o_1, a_1), ..., (o_n, a_n)]`
+ * when given the same input.
+ */
+ def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+ ((o, i), stateAfterProcessingI)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan before each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs [a_1, ..., a_n] resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+ */
+ def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((stateBeforeProcessingI, o), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan after each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_1), ..., (o_n, state_n)]`.
+ */
+ def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((c, stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, return a scan whose apply method, when given the same input, will return `[(o_1, 0),
+ * ..., (o_n, n - 1)]`.
+ */
+ def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+ /**
+ * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+ * pairwise zipped outputs.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan whose
+ * apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]`, results in the output `[(o_1, p_1), ...,
+ * (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) == scan(foo).zip(scan2(bar))`
+ */
+ def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+ val (o1, state1AfterProcesingI1) =
+ presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+ val (o2, state2AfterProcesingI2) =
+ scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+ ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+ }
+
+ /**
+ * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+ * on a common input stream.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs [a_1, ..., a_n] resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan whose
+ * apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+ * scan(foo).zip(scan2(foo))`
+ */
+ def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+ val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+ val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+ ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+ }
+
+ /**
+ * Takes the output of this scan and feeds as input into scan2.
+ * @param scan2
+ * @tparam P
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a scan which
+ * returns `[p_1, ..., p_n]`.
+ */
+ def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+ val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+ val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+ (p, (state1AfterProcesingI, state2AfterProcesingO))
+ }
+
+}
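One combinator in action, reusing the illustrative running sum from the earlier sketch:

```scala
import com.twitter.algebird.Scan

val runningSum = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))

// Pair each output with the state held before that input was processed.
val withPrior = runningSum.joinWithPriorState
assert(withPrior(List(1, 2, 3)) == List((0, 1), (1, 3), (3, 6)))
```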
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+ override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+ mt.andThenPresent(fn)
+
+ override def apply[T](v: T): Scan[I, T] =
+ Scan.const(v)
+
+ override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+ mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.12/SpaceSaver.scala b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+ * SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+ * public api to create a new SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+ SSMany(capacity, Map(item -> ((count, 0L))))
+
+ private[algebird] val ordering =
+ Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+ (-count, err)
+ }
+
+ implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+ new SpaceSaverSemigroup[T]
+
+ /**
+ * Encodes the SpaceSaver as a sequence of bytes containing, in order:
+ *   - 1 byte: the type tag (1 = SSOne, 2 = SSMany)
+ *   - 4 bytes: the capacity
+ *   - N bytes: the item/counters; for SSMany this is the number of counters followed by, for each counter,
+ *     the item's byte length, the item's bytes, and its two Long counters
+ */
+ def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+ ss match {
+ case SSOne(capacity, item) =>
+ val itemAsBytes = tSerializer(item)
+ val itemLength = itemAsBytes.length
+ // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+ val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+ ByteBuffer
+ .wrap(buffer)
+ .put(1: Byte)
+ .putInt(capacity)
+ .putInt(itemLength)
+ .put(itemAsBytes)
+ buffer
+
+ case SSMany(
+ capacity,
+ counters,
+ _
+ ) => // We do not care about the buckets as they are created by SSMany.apply
+ val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+ buffer += (2: Byte)
+
+ var buff = ByteBuffer.allocate(4)
+ buff.putInt(capacity)
+ buffer ++= buff.array()
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(counters.size)
+ buffer ++= buff.array()
+ counters.foreach { case (item, (a, b)) =>
+ val itemAsBytes = tSerializer(item)
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(itemAsBytes.length)
+ buffer ++= buff.array()
+
+ buffer ++= itemAsBytes
+
+ buff = ByteBuffer.allocate(8 * 2)
+ buff.putLong(a)
+ buff.putLong(b)
+ buffer ++= buff.array()
+ }
+ buffer.result().toArray
+ }
+
+ // Make sure to be reversible so fromBytes(toBytes(x)) == x
+ def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+ fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+ def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+ Try {
+ bb.get.toInt match {
+ case 1 =>
+ val capacity = bb.getInt
+ val itemLength = bb.getInt
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+ case 2 =>
+ val capacity = bb.getInt
+
+ var countersToDeserialize = bb.getInt
+ val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+ while (countersToDeserialize != 0) {
+ val itemLength = bb.getInt()
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+ val a = bb.getLong
+ val b = bb.getLong
+
+ item match {
+ case Failure(e) => return Failure(e)
+ case Success(i) =>
+ counters += ((i, (a, b)))
+ }
+
+ countersToDeserialize -= 1
+ }
+
+ Success(SSMany(capacity, counters.toMap))
+ }
+ }.flatten
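+
+ /* Illustrative round trip (not from the original source), assuming a UTF-8
+ * String codec; as noted above, fromBytes(toBytes(x)) == x:
+ *
+ * {{{
+ * val ss = SpaceSaver(10, "item")
+ * val bytes = SpaceSaver.toBytes(ss, (s: String) => s.getBytes("UTF-8"))
+ * SpaceSaver.fromBytes(bytes, (b: Array[Byte]) => Try(new String(b, "UTF-8")))
+ * // => Success(SSOne(10, "item"))
+ * }}}
+ */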
+}
+
+/**
+ * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * to preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+ import SpaceSaver.ordering
+
+ /**
+ * Maximum number of counters to keep (parameter "m" in the research paper).
+ */
+ def capacity: Int
+
+ /**
+ * Current lowest value for count
+ */
+ def min: Long
+
+ /**
+ * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+ * (error)
+ */
+ def counters: Map[T, (Long, Long)]
+
+ def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+ /**
+ * returns the frequency estimate for the item
+ */
+ def frequency(item: T): Approximate[Long] = {
+ val (count, err) = counters.getOrElse(item, (min, min))
+ Approximate(count - err, count, count, 1.0)
+ }
+
+ /**
+ * Get the elements that show up at least `thres` times. Returns sorted in descending order: (item,
+ * Approximate[Long], guaranteed)
+ */
+ def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+ counters.iterator
+ .filter { case (_, (count, _)) => count >= thres }
+ .toList
+ .sorted(ordering)
+ .map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+ }
+
+ /**
+ * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+ */
+ def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+ require(k < capacity)
+ val si = counters.toList
+ .sorted(ordering)
+ val siK = si.take(k)
+ val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+ siK.map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+ }
+ }
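+
+ /* Illustrative sketch (not from the original source): single-item summaries
+ * combine through `++` (the Semigroup), and topK reports
+ * (item, Approximate[Long], guaranteed):
+ *
+ * {{{
+ * val stream = List("a", "b", "a", "c", "a")
+ * val summary = stream.map(SpaceSaver(3, _)).reduce(_ ++ _)
+ * summary.topK(2) // "a" ranks first, with its frequency bounds
+ * }}}
+ */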
+
+ /**
+ * Check consistency with other SpaceSaver, useful for testing. Returns boolean indicating if they are
+ * consistent
+ */
+ def consistentWith(that: SpaceSaver[T]): Boolean =
+ (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0)
+}
+
+case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] {
+ require(capacity > 1)
+
+ override def min: Long = 0L
+
+ override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L)))
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => SSMany(this).add(other)
+ case other: SSMany[?] => other.add(this)
+ }
+}
+
+object SSMany {
+ private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] =
+ SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap
+
+ private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] =
+ SSMany(capacity, counters, bucketsFromCounters(counters))
+
+ private[algebird] def apply[T](one: SSOne[T]): SSMany[T] =
+ SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item)))
+}
+
+case class SSMany[T] private (
+ override val capacity: Int,
+ override val counters: Map[T, (Long, Long)],
+ buckets: SortedMap[Long, Set[T]]
+) extends SpaceSaver[T] {
+ private val exact: Boolean = counters.size < capacity
+
+ override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey
+
+ // item is already present and just needs to be bumped up one
+ private def bump(item: T) = {
+ val (count, err) = counters(item)
+ val counters1 = counters + (item -> ((count + 1L, err))) // increment by one
+ val currBucket = buckets(count) // current bucket
+ val buckets1 = {
+ if (currBucket.size == 1) // delete current bucket since it will be empty
+ buckets - count
+ else // remove item from current bucket
+ buckets + (count -> (currBucket - item))
+ } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // lose one item to meet capacity constraint
+ private def loseOne = {
+ val firstBucket = buckets(buckets.firstKey)
+ val itemToLose = firstBucket.head
+ val counters1 = counters - itemToLose
+ val buckets1 =
+ if (firstBucket.size == 1)
+ buckets - min
+ else
+ buckets + (min -> (firstBucket - itemToLose))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // introduce new item
+ private def introduce(item: T, count: Long, err: Long) = {
+ val counters1 = counters + (item -> ((count, err)))
+ val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // add a single element
+ private[algebird] def add(x: SSOne[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ if (counters.contains(x.item))
+ bump(x.item)
+ else
+ (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min)
+ }
+
+ // merge two stream summaries
+ private def merge(x: SSMany[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ val counters1 = Map() ++
+ (counters.keySet ++ x.counters.keySet).toList
+ .map { key =>
+ val (count1, err1) = counters.getOrElse(key, (min, min))
+ val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min))
+ key -> ((count1 + count2, err1 + err2))
+ }
+ .sorted(SpaceSaver.ordering)
+ .take(capacity)
+ SSMany(capacity, counters1)
+ }
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => add(other)
+ case other: SSMany[?] => merge(other)
+ }
+}
+
+class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] {
+ override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y
+}
diff --git a/algebird-core/src/main/scala-2.12/VectorSpace.scala b/algebird-core/src/main/scala-2.12/VectorSpace.scala
new file mode 100644
index 000000000..f8818600c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/VectorSpace.scala
@@ -0,0 +1,59 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import scala.annotation.implicitNotFound
+
+/**
+ * This class represents a vector space. For the required properties see:
+ *
+ * http://en.wikipedia.org/wiki/Vector_space#Definition
+ */
+object VectorSpace extends VectorSpaceOps with Implicits
+
+sealed trait VectorSpaceOps {
+ def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] =
+ vs.scale(v, c)
+ def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] =
+ new VectorSpace[F, C] {
+ override def ring: Ring[F] = r
+ override def group: Group[C[F]] = cGroup
+ override def scale(v: F, c: C[F]): C[F] =
+ if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero
+ }
+}
+private object VectorSpaceOps extends VectorSpaceOps
+
+sealed trait Implicits extends LowPrioImplicits {
+ implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] =
+ VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _)))
+}
+
+sealed trait LowPrioImplicits {
+ implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] =
+ VectorSpaceOps.from[T, Map[K, _]] { (s, m) =>
+ m.transform { case (_, v) => Ring.times(s, v) }
+ }
+}
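+
+/* Illustrative sketch (not from the original source): with the implicits above
+ * in scope, scaling uses the scalar's Ring and maps over the container:
+ *
+ * {{{
+ * VectorSpace.scale(2, IndexedSeq(1, 2, 3)) // IndexedSeq(2, 4, 6)
+ * VectorSpace.scale(2.0, Map("x" -> 1.0, "y" -> 2.0)) // Map("x" -> 2.0, "y" -> 4.0)
+ * }}}
+ */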
+
+@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}")
+trait VectorSpace[F, C[_]] extends java.io.Serializable {
+ implicit def ring: Ring[F]
+ def field: Ring[F] = ring // this is for compatibility with older versions
+ implicit def group: Group[C[F]]
+ def scale(v: F, c: C[F]): C[F]
+}
diff --git a/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala
new file mode 100644
index 000000000..b6d5e2ffc
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// Monad for either, used for modeling Error where L is the type of the error
+object EitherMonad {
+ class Error[L] extends Monad[Either[L, *]] {
+ override def apply[R](r: R): Right[L, R] = Right(r)
+
+ override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] =
+ self.right.flatMap(next)
+
+ override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] =
+ self.right.map(fn)
+ }
+
+ implicit def monad[L]: Monad[Either[L, _]] = new Error[L]
+
+ def assert[L](truth: Boolean, failure: => L): Either[L, Unit] =
+ if (truth) Right(()) else Left(failure)
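+
+ /* Illustrative sketch (not from the original source): the monad sequences
+ * Rights and short-circuits on the first Left:
+ *
+ * {{{
+ * val m = EitherMonad.monad[String]
+ * m.flatMap(Right(1): Either[String, Int])(i => Right(i + 1)) // Right(2)
+ * m.flatMap(Left("boom"): Either[String, Int])(i => Right(i + 1)) // Left("boom")
+ * EitherMonad.assert(2 > 1, "must be increasing") // Right(())
+ * }}}
+ */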
+}
diff --git a/algebird-core/src/main/scala-2.12/monad/Reader.scala b/algebird-core/src/main/scala-2.12/monad/Reader.scala
new file mode 100644
index 000000000..e0747af20
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/monad/Reader.scala
@@ -0,0 +1,76 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// TODO this is general, move somewhere better
+
+// Reader Monad, represents a series of operations that mutate some environment
+// type (the input to the function)
+
+sealed trait Reader[-Env, +T] {
+ def apply(env: Env): T
+ def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ FlatMappedReader[E1, T, U](this, next)
+ def map[U](thatFn: T => U): Reader[Env, U] =
+ FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t)))
+}
+
+final case class ConstantReader[+T](get: T) extends Reader[Any, T] {
+ override def apply(env: Any): T = get
+ override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get))
+ override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ next(get)
+}
+final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] {
+ override def apply(env: E): T = fn(env)
+}
+final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] {
+ override def apply(env: E): T = {
+ @annotation.tailrec
+ def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any =
+ r match {
+ case ConstantReader(get) =>
+ stack match {
+ case head :: tail => loop(head(get), tail)
+ case Nil => get
+ }
+ case ReaderFn(fn) =>
+ stack match {
+ case head :: tail => loop(head(fn(env)), tail)
+ case Nil => fn(env)
+ }
+ case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack)
+ }
+ loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T]
+ }
+}
+
+object Reader {
+ def const[T](t: T): Reader[Any, T] = ConstantReader(t)
+ implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn)
+
+ class ReaderM[Env] extends Monad[Reader[Env, _]] {
+ override def apply[T](t: T): ConstantReader[T] = ConstantReader(t)
+ override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] =
+ self.flatMap(next)
+ override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] = self.map(fn)
+ }
+
+ implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env]
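+
+ /* Illustrative sketch (not from the original source): a Reader threads a
+ * read-only environment through a computation; the implicit `apply` above
+ * lifts plain functions:
+ *
+ * {{{
+ * val name: Reader[Map[String, String], String] = (env: Map[String, String]) => env("name")
+ * val greet = name.map(n => "hello " + n)
+ * greet(Map("name" -> "algebird")) // "hello algebird"
+ * }}}
+ */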
+}
diff --git a/algebird-core/src/main/scala-2.12/monad/StateWithError.scala b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala
new file mode 100644
index 000000000..e15a9ebc3
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala
@@ -0,0 +1,130 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.{Monad, Semigroup}
+
+/**
+ * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase
+ * with existing mutable APIs (like storm or cascading), but retain the ability to compose carefully.
+ */
+sealed trait StateWithError[S, +F, +T] {
+ def join[F1 >: F, U](
+ that: StateWithError[S, F1, U],
+ mergeErr: (F1, F1) => F1,
+ mergeState: (S, S) => S
+ ): StateWithError[S, F1, (T, U)] =
+ join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState))
+
+ def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit
+ sgf: Semigroup[F1],
+ sgs: Semigroup[S]
+ ): // TODO: deep joins could blow the stack, not yet using trampoline here
+ StateWithError[S, F1, (T, U)] =
+ StateFn { (requested: S) =>
+ (run(requested), that.run(requested)) match {
+ case (Right((s1, r1)), Right((s2, r2))) =>
+ Right((sgs.plus(s1, s2), (r1, r2)))
+ case (Left(err1), Left(err2)) =>
+ Left(sgf.plus(err1, err2)) // Our earlier is not ready
+ case (Left(err), _) => Left(err)
+ case (_, Left(err)) => Left(err)
+ }
+ }
+
+ def apply(state: S): Either[F, (S, T)] = run(state)
+
+ def run(state: S): Either[F, (S, T)]
+
+ def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] =
+ FlatMappedState(this, next)
+
+ def map[U](fn: (T) => U): StateWithError[S, F, U] =
+ FlatMappedState(this, (t: T) => StateWithError.const(fn(t)))
+}
+
+/** Simple wrapper of a function in the Monad */
+final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] {
+ override def run(state: S): Either[F, (S, T)] = fn(state)
+}
+
+/**
+ * A Trampolining instance that should prevent stack overflow at the expense of performance
+ */
+final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U])
+ extends StateWithError[S, F, U] {
+ override def run(state: S): Either[F, (S, U)] = {
+ @annotation.tailrec
+ def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any =
+ st match {
+ case StateFn(fn) =>
+ fn(inState) match {
+ case err @ Left(_) => err // bail at first error
+ case noError @ Right((newState, out)) =>
+ stack match {
+ case head :: tailStack => loop(newState, head(out), tailStack)
+ case Nil => noError // recursion ends
+ }
+ }
+ case FlatMappedState(st, next) => loop(inState, st, next :: stack)
+ }
+ loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]]
+ }
+}
+
+object StateWithError {
+ def getState[S]: StateWithError[S, Nothing, S] =
+ StateFn((state: S) => Right((state, state)))
+ def putState[S](newState: S): StateWithError[S, Nothing, Unit] =
+ StateFn((_: S) => Right((newState, ())))
+ def swapState[S](newState: S): StateWithError[S, Nothing, S] =
+ StateFn((old: S) => Right((newState, old)))
+
+ def const[S, T](t: T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def failure[S, F](f: F): StateWithError[S, F, Nothing] =
+ StateFn(_ => Left(f))
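+
+ /* Illustrative sketch (not from the original source): a counter that fails
+ * past a limit; flatMap threads the state and a Left short-circuits:
+ *
+ * {{{
+ * val tick: StateWithError[Int, String, Int] =
+ *   StateFn((s: Int) => if (s < 2) Right((s + 1, s)) else Left("limit"))
+ * tick.flatMap(_ => tick).run(0) // Right((2, 1))
+ * tick.flatMap(_ => tick).flatMap(_ => tick).run(0) // Left("limit")
+ * }}}
+ */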
+
+ /**
+ * Use like fromEither[Int](Right("good")) to get a constant Either in the monad
+ */
+ def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S]
+ class ConstantStateMaker[S] {
+ def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) }
+ }
+
+ class FunctionLifter[S] {
+ def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) =>
+ StateFn((s: S) => fn(i).right.map((s, _)))
+ }
+ }
+ // TODO this should move to Monad and work for any Monad
+ def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S]
+
+ implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn)
+ implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S]
+
+ class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] {
+ override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) }
+ override def flatMap[T, U](
+ earlier: StateWithError[S, F, T]
+ )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] =
+ earlier.flatMap(next)
+ }
+}
diff --git a/algebird-core/src/main/scala-2.13/Aggregator.scala b/algebird-core/src/main/scala-2.13/Aggregator.scala
new file mode 100644
index 000000000..8a4d2b230
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Aggregator.scala
@@ -0,0 +1,637 @@
+package com.twitter.algebird
+
+import java.util.PriorityQueue
+import scala.collection.compat._
+import scala.collection.generic.CanBuildFrom
+
+/**
+ * Aggregators compose well.
+ *
+ * To create a parallel aggregator that operates on a single input in parallel, use:
+ * GeneratedTupleAggregator.from2((agg1, agg2))
+ */
+object Aggregator extends java.io.Serializable {
+ implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] =
+ new AggregatorApplicative[I]
+
+ private val DefaultSeed = 471312384
+
+ /**
+ * This is a trivial aggregator that always returns a single value
+ */
+ def const[T](t: T): MonoidAggregator[Any, Unit, T] =
+ prepareMonoid { (_: Any) => () }.andThenPresent(_ => t)
+
+ /**
+ * Using Aggregator.prepare,present you can add to this aggregator
+ */
+ def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] =
+ fromSemigroup(Semigroup.from(red))
+ def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] =
+ new Aggregator[T, T, T] {
+ override def prepare(input: T): T = input
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] =
+ prepareMonoid(identity[T])
+ // Uses the product from the ring
+ def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] =
+ fromRing[T, T](rng, identity[T])
+
+ def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] =
+ prepareMonoid(prep)(mon)
+
+ def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] =
+ new Aggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ new MonoidAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def monoid: Monoid[T] = m
+ override def present(reduction: T): T = reduction
+ }
+ // Uses the product from the ring
+ def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] =
+ new RingAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def ring: Ring[T] = rng
+ override def present(reduction: T): T = reduction
+ }
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to
+ * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}}
+ */
+ def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, T] =
+ appendSemigroup(prep, appnd, identity[T])(sg)
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Semigroup]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param prep
+ * The preparation function. Expected to construct an instance of type T from a single data element.
+ * @param appnd
+ * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator.
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param sg
+ * The [[Semigroup]] type class
+ * @note
+ * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}}
+ */
+ def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, P] =
+ new Aggregator[F, T, P] {
+ override def semigroup: Semigroup[T] = sg
+ override def prepare(input: F): T = prep(input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P =
+ applyOption(inputs).get
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ agg(inputs).map(pres)
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ if (items.iterator.isEmpty) old else reduce(old, agg(items).get)
+
+ private def agg(inputs: TraversableOnce[F]): Option[T] =
+ if (inputs.iterator.isEmpty) None
+ else {
+ val itr = inputs.iterator
+ val t = prepare(itr.next())
+ Some(itr.foldLeft(t)(appnd))
+ }
+ }
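+
+ /* Illustrative sketch (not from the original source): a running max whose
+ * append folds raw inputs directly; note the required law
+ * appnd(t, f) == sg.plus(t, prep(f)):
+ *
+ * {{{
+ * val maxAgg = Aggregator.appendSemigroup[Int, Int](identity[Int], math.max)(Semigroup.from[Int](math.max))
+ * maxAgg(List(3, 1, 2)) // 3
+ * }}}
+ */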
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent
+ * to {{{appendMonoid(appnd, identity[T]_)(m)}}}
+ */
+ def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ appendMonoid(appnd, identity[T])(m)
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Monoid]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param appnd
+ * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this
+ * aggregator. Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param m
+ * The [[Monoid]] type class
+ * @note
+ * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}}
+ */
+ def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit
+ m: Monoid[T]
+ ): MonoidAggregator[F, T, P] =
+ new MonoidAggregator[F, T, P] {
+ override def monoid: Monoid[T] = m
+ override def prepare(input: F): T = appnd(m.zero, input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs))
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ if (inputs.iterator.isEmpty) None else Some(apply(inputs))
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ reduce(old, agg(items))
+
+ override def appendAll(items: TraversableOnce[F]): T = agg(items)
+
+ private def agg(inputs: TraversableOnce[F]): T =
+ inputs.foldLeft(m.zero)(append)
+ }
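+
+ /* Illustrative sketch (not from the original source): counting matches with
+ * no separate prepare step; the law appnd(t, f) == m.plus(t, appnd(m.zero, f)) holds:
+ *
+ * {{{
+ * val vowels = Aggregator.appendMonoid[Char, Long]((n, c) => if ("aeiou".contains(c)) n + 1L else n)
+ * vowels("algebird".toList) // 3L
+ * }}}
+ */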
+
+ /**
+ * How many items satisfy a predicate
+ */
+ def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] =
+ prepareMonoid { (t: T) => if (pred(t)) 1L else 0L }
+
+ /**
+ * Do any items satisfy some predicate
+ */
+ def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(OrVal.unboxedMonoid)
+
+ /**
+ * Do all items satisfy a predicate
+ */
+ def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(AndVal.unboxedMonoid)
+
+ /**
+ * Take the first (left most in reduce order) item found
+ */
+ def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l)
+
+ /**
+ * Take the last (right most in reduce order) item found
+ */
+ def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r)
+
+ /**
+ * Get the maximum item
+ */
+ def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T]
+ def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ max[U]
+ }
+
+ /**
+ * Get the minimum item
+ */
+ def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T]
+ def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ min[U]
+ }
+
+ /**
+ * This returns the number of items we find
+ */
+ def size: MonoidAggregator[Any, Long, Long] =
+ prepareMonoid((_: Any) => 1L)
+
+ /**
+ * Take the smallest `count` items using a heap
+ */
+ def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)
+
+ /**
+ * Same as sortedTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).take(count).
+ */
+ def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedTake(count)(Ordering.by(fn))
+
+ /**
+ * Take the largest `count` items using a heap
+ */
+ def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Same as sortedReverseTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).reverse.take(count).
+ */
+ def sortByReverseTake[T, U: Ordering](
+ count: Int
+ )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedReverseTake(count)(Ordering.by(fn))
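+
+ /* Illustrative sketch (not from the original source): the heap keeps only
+ * `count` elements, so memory stays O(count) no matter the input size:
+ *
+ * {{{
+ * Aggregator.sortedTake[Int](3).apply(List(5, 1, 4, 2, 3)) // Seq(1, 2, 3)
+ * Aggregator.sortedReverseTake[Int](2).apply(List(5, 1, 4, 2, 3)) // Seq(5, 4)
+ * Aggregator.sortByTake(2)((s: String) => s.length).apply(List("aaa", "b", "cc")) // Seq("b", "cc")
+ * }}}
+ */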
+
+ /**
+ * Immutable version of sortedTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)
+
+ /**
+ * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Randomly selects input items where each item has an independent probability 'prob' of being selected.
+ * This assumes that all sampled records can fit in memory, so use this only when the expected number of
+ * sampled values is small.
+ */
+ def randomSample[T](
+ prob: Double,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, Option[Batched[T]], List[T]] = {
+ assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]")
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .filter(_ => rng.nextDouble() <= prob)
+ .monoidAggregate(toList)
+ }
+
+ /**
+ * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than
+ * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only
+ * for small values of 'count'.
+ */
+ def reservoirSample[T](
+ count: Int,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .map(rng.nextDouble() -> _)
+ .monoidAggregate(sortByTake(count)(_._1))
+ .andThenPresent(_.map(_._2))
+ }
+
+ /**
+ * Put everything in a List. Note, this could fill the memory if the List is very large.
+ */
+ def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] =
+ new MonoidAggregator[T, Option[Batched[T]], List[T]] {
+ override def prepare(t: T): Option[Batched[T]] = Some(Batched(t))
+ override def monoid: Monoid[Option[Batched[T]]] =
+ Monoid.optionMonoid(Batched.semigroup)
+ override def present(o: Option[Batched[T]]): List[T] =
+ o.map(_.toList).getOrElse(Nil)
+ }
+
+ /**
+ * Put everything in a Set. Note, this could fill the memory if the Set is very large.
+ */
+ def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] =
+ prepareMonoid { (t: T) => Set(t) }
+
+ /**
+ * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the
+ * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an
+ * approximate version of this that is scalable.
+ */
+ def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] =
+ toSet[T].andThenPresent(_.size)
+
+ /**
+ * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set
+ * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error, which uses at most 8192 bytes for
+ * each HLL. For more control, see HyperLogLogAggregator.
+ */
+ def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] =
+ SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)
+
+ /**
+ * Returns the lower bound of a given percentile where the percentile is between (0, 1]. The items that are
+ * iterated over cannot be negative.
+ */
+ def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregatorLowerBound[T] =
+ QTreeAggregatorLowerBound[T](percentile, k)
+
+ /**
+ * Returns the intersection of a bounded percentile where the percentile is between (0, 1]. The items that
+ * are iterated over cannot be negative.
+ */
+ def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregator[T] =
+ QTreeAggregator[T](percentile, k)
+
+ /**
+ * An aggregator that sums Numeric values into Doubles.
+ *
+ * This is really no more than converting to Double and then summing. The conversion to double means we
+ * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue).
+ *
+ * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you
+ * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T]
+ * after importing the numericRing implicit:
+ *
+ * {{{
+ * import com.twitter.algebird.Ring.numericRing
+ * def numericAggregator[T: Numeric]: MonoidAggregator[T, T, T] = Aggregator.fromMonoid[T]
+ * }}}
+ */
+ def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] =
+ Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid)
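+
+ /* Illustrative sketch (not from the original source): values are widened to
+ * Double before summing, so jvm integer overflow semantics do not apply:
+ *
+ * {{{
+ * Aggregator.numericSum[Int].apply(List(1, 2, 3)) // 6.0
+ * Aggregator.numericSum[Int].apply(List(Int.MaxValue, 1)) // 2.147483648E9, not Int.MinValue
+ * }}}
+ */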
+
+}
+
+/**
+ * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup,
+ * then finally we present the results.
+ *
+ * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators
+ * are useful in parallel map/reduce systems where there may be some additional types needed to cross the
+ * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle
+ * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag:
+ * Aggregator[T, _, Int]): Int)
+ *
+ * Note, join is very useful to combine multiple aggregations with one pass. Also
+ * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well.
+ *
+ * This type is the Fold.M from Haskell's fold package:
+ * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html
+ */
+trait Aggregator[-A, B, +C] extends java.io.Serializable { self =>
+ def prepare(input: A): B
+ def semigroup: Semigroup[B]
+ def present(reduction: B): C
+
+ /* *****
+ * All the following are in terms of the above
+ */
+
+ /**
+ * combine two inner values
+ */
+ def reduce(l: B, r: B): B = semigroup.plus(l, r)
+
+ /**
+ * This may error if items is empty. To be safe, you might use reduceOption if you don't know that items is
+ * non-empty.
+ */
+ def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get
+
+ /**
+ * This is the safe version of the above. If the input is empty, return None, else reduce the items
+ */
+ def reduceOption(items: TraversableOnce[B]): Option[B] =
+ semigroup.sumOption(items)
+
+ /**
+ * This may error if inputs are empty (for Monoid Aggregators it never will; instead you see
+ * present(Monoid.zero[B]))
+ */
+ def apply(inputs: TraversableOnce[A]): C =
+ present(reduce(inputs.iterator.map(prepare)))
+
+ /**
+ * This returns None if the inputs are empty
+ */
+ def applyOption(inputs: TraversableOnce[A]): Option[C] =
+ reduceOption(inputs.iterator.map(prepare))
+ .map(present)
+
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def cumulativeIterator(inputs: Iterator[A]): Iterator[C] =
+ inputs
+ .scanLeft(None: Option[B]) {
+ case (None, a) => Some(prepare(a))
+ case (Some(b), a) => Some(append(b, a))
+ }
+ .collect { case Some(b) => present(b) }
+
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def applyCumulatively[In <: TraversableOnce[A], Out](
+ inputs: In
+ )(implicit bf: CanBuildFrom[In, C, Out]): Out =
+ (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator))
+
+ def append(l: B, r: A): B = reduce(l, prepare(r))
+
+ def appendAll(old: B, items: TraversableOnce[A]): B =
+ if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare)))
+
+ /** Like calling andThen on the present function */
+ def andThenPresent[D](present2: C => D): Aggregator[A, B, D] =
+ new Aggregator[A, B, D] {
+ override def prepare(input: A): B = self.prepare(input)
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): D = present2(self.present(reduction))
+ }
+
+ /** Like calling compose on the prepare function */
+ def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] =
+ new Aggregator[A1, B, C] {
+ override def prepare(input: A1): B = self.prepare(prepare2(input))
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to run two aggregators on the same data with a single pass
+ */
+ def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] =
+ GeneratedTupleAggregator.from2((this, that))
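+
+ /* Illustrative sketch (not from the original source): one pass over the data
+ * feeds both aggregators:
+ *
+ * {{{
+ * val sizeAndSum = Aggregator.size.join(Aggregator.fromMonoid[Int])
+ * sizeAndSum(List(1, 2, 3)) // (3L, 6)
+ * }}}
+ */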
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = this
+ new Aggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
+
+ /**
+ * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained, so only do
+ * this if you require joining a Fold with an Aggregator to produce a Fold.
+ */
+ def toFold: Fold[A, Option[C]] =
+ Fold.fold[Option[B], A, Option[C]](
+ {
+ case (None, a) => Some(self.prepare(a))
+ case (Some(b), a) => Some(self.append(b, a))
+ },
+ None,
+ _.map(self.present)
+ )
+
+ def lift: MonoidAggregator[A, Option[B], Option[C]] =
+ new MonoidAggregator[A, Option[B], Option[C]] {
+ override def prepare(input: A): Option[B] = Some(self.prepare(input))
+ override def present(reduction: Option[B]): Option[C] = reduction.map(self.present)
+ override def monoid = new OptionMonoid[B]()(self.semigroup)
+ }
+}
+
+/**
+ * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the
+ * middle type use join on the trait, or GeneratedTupleAggregator.fromN
+ */
+class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] {
+ override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] =
+ mt.andThenPresent(fn)
+ override def apply[T](v: T): Aggregator[I, ?, T] =
+ Aggregator.const(v)
+ override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] =
+ mt.join(mu)
+ override def join[T1, T2, T3](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3]
+ ): Aggregator[I, ?, (T1, T2, T3)] =
+ GeneratedTupleAggregator.from3((m1, m2, m3))
+
+ override def join[T1, T2, T3, T4](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4]
+ ): Aggregator[I, ?, (T1, T2, T3, T4)] =
+ GeneratedTupleAggregator.from4((m1, m2, m3, m4))
+
+ override def join[T1, T2, T3, T4, T5](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4],
+ m5: Aggregator[I, ?, T5]
+ ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] =
+ GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5))
+}
+
+trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self =>
+ def monoid: Monoid[B]
+ override def semigroup: Monoid[B] = monoid
+ final override def reduce(items: TraversableOnce[B]): B =
+ monoid.sum(items)
+
+ def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare))
+
+ override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = {
+ val self = this
+ new MonoidAggregator[A, B, D] {
+ override def prepare(a: A): B = self.prepare(a)
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): D = present2(self.present(b))
+ }
+ }
+ override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = {
+ val self = this
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B = self.prepare(prepare2(a))
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+ }
+
+ /**
+ * Build a MonoidAggregator that either takes left or right input and outputs the pair from both
+ */
+ def either[A2, B2, C2](
+ that: MonoidAggregator[A2, B2, C2]
+ ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] =
+ new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] {
+ override def prepare(e: Either[A, A2]): (B, B2) = e match {
+ case Left(a) => (self.prepare(a), that.monoid.zero)
+ case Right(a2) => (self.monoid.zero, that.prepare(a2))
+ }
+ override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid)
+ override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2))
+ }
+
+ /**
+ * Only transform values where the function is defined, else discard
+ */
+ def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] =
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B =
+ if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * Only aggregate items that match a predicate
+ */
+ def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] =
+ new MonoidAggregator[A1, B, C] {
+ override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator
+ */
+ def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] =
+ new MonoidAggregator[TraversableOnce[A], B, C] {
+ override def monoid: Monoid[B] = self.monoid
+ override def prepare(input: TraversableOnce[A]): B =
+ monoid.sum(input.iterator.map(self.prepare))
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = self
+ new MonoidAggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
+}
+
+trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] {
+ def ring: Ring[B]
+ override def monoid: Monoid[B] = Ring.asTimesMonoid(ring)
+}
diff --git a/algebird-core/src/main/scala-2.13/CountMinSketch.scala b/algebird-core/src/main/scala-2.13/CountMinSketch.scala
new file mode 100644
index 000000000..826aebd5a
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/CountMinSketch.scala
@@ -0,0 +1,1420 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import algebra.CommutativeMonoid
+
+import scala.collection.compat._
+
+/**
+ * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear
+ * space.
+ *
+ * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error
+ * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`.
+ *
+ * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively.
+ *
+ * Then:
+ *
+ * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`.
+ * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes.
+ * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] +=
+ * 1`, for each `1 <= i <= d`.
+ * - (Note the rough similarity to a Bloom filter.)
+ *
+ * As an example application, suppose you want to estimate the number of times an element `x` has appeared in
+ * a data stream so far. The Count-Min sketch estimate of this frequency is
+ *
+ * min_i { counts[i, h_i[x]] }
+ *
+ * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true
+ * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far.
+ *
+ * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the
+ * estimates and error bounds used in this implementation.
+ *
+ * Parts of this implementation are taken from
+ * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
+ *
+ * @author
+ * Edwin Chen
+ */
+/**
+ * Monoid for adding CMS sketches.
+ *
+ * =Usage=
+ *
+ * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in
+ * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are
+ * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor
+ * depending on eps."
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param seed
+ * A seed to initialize the random number generator used to create the pairwise independent hash functions.
+ * @param maxExactCountOpt
+ * An Option parameter about how many exact counts a sparse CMS wants to keep.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be
+ * imported. Which type K should you pick in practice? For domains that have less than `2^64` unique
+ * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other
+ * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire),
+ * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]).
+ */
+class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None)
+ extends Monoid[CMS[K]]
+ with CommutativeMonoid[CMS[K]] {
+
+ val params: CMSParams[K] = {
+ val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+
+ override val zero: CMS[K] = CMSZero[K](params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: CMS[K], right: CMS[K]): CMS[K] = {
+ require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.")
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): CMS[K] = CMSItem[K](item, 1L, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ data.foreach(k => summation.insert(k, 1L))
+ summation.result
+ }
+
+ override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+
+ override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ summation.updateAll(sketches)
+ summation.result
+ }
+}
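+
+/* Illustrative sketch (not from the original source): Algebird ships an
+ * implicit CMSHasher[Long], so a monoid can be built directly; a frequency
+ * estimate is an upper bound within eps * totalCount with probability at
+ * least 1 - delta:
+ *
+ * {{{
+ * val monoid = new CMSMonoid[Long](eps = 0.01, delta = 1e-5, seed = 1)
+ * val sketch = monoid.create(Seq(1L, 2L, 1L))
+ * sketch.frequency(1L).estimate // >= 2
+ * }}}
+ */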
+
+/**
+ * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability
+ * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without
+ * letting a reference to the instance escape into a closure.
+ */
+class CMSSummation[K](params: CMSParams[K]) {
+ private[this] val hashes = params.hashes.toArray
+ private[this] val height = CMSFunctions.depth(params.delta)
+ private[this] val width = CMSFunctions.width(params.eps)
+ private[this] val cells = new Array[Long](height * width)
+ private[this] var totalCount = 0L
+
+ final def insert(k: K, count: Long): Unit = {
+ var row = 0
+ var offset = 0
+ val hs = hashes
+ while (row < hs.length) {
+ cells(offset + hs(row)(k)) += count
+ offset += width
+ row += 1
+ }
+ totalCount += count
+ }
+
+ def updateAll(sketches: TraversableOnce[CMS[K]]): Unit =
+ sketches.iterator.foreach(updateInto)
+
+ def updateInto(cms: CMS[K]): Unit =
+ cms match {
+ case CMSZero(_) =>
+ ()
+ case CMSItem(item, count, _) =>
+ insert(item, count)
+ case SparseCMS(table, _, _) =>
+ table.foreach { case (item, c) =>
+ insert(item, c)
+ }
+ case CMSInstance(CMSInstance.CountsTable(matrix), count, _) =>
+ var offset = 0
+ val rit = matrix.iterator
+ while (rit.hasNext) {
+ var col = 0
+ val cit = rit.next().iterator
+ while (cit.hasNext) {
+ cells(offset + col) += cit.next()
+ col += 1
+ }
+ offset += width
+ }
+ totalCount += count
+ }
+
+ def result: CMS[K] =
+ if (totalCount == 0L) CMSZero(params)
+ else {
+ def vectorize(row: Int): Vector[Long] = {
+ val offset = row * width
+ val b = Vector.newBuilder[Long]
+ var col = 0
+ while (col < width) {
+ b += cells(offset + col)
+ col += 1
+ }
+ b.result()
+ }
+
+ val b = Vector.newBuilder[Vector[Long]]
+ var row = 0
+ while (row < height) {
+ b += vectorize(row)
+ row += 1
+ }
+ CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params)
+ }
+}
+
+/**
+ * An Aggregator for [[CMS]]. Can be created using CMS.aggregator.
+ */
+case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] {
+ override val monoid: CMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): CMS[K] = monoid.create(value)
+
+ override def present(cms: CMS[K]): CMS[K] = cms
+
+}
+
+/**
+ * Configuration parameters for [[CMS]].
+ *
+ * @param hashes
+ * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from
+ * `delta`).
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param maxExactCountOpt
+ * An Option parameter about how many exact counts a sparse CMS wants to keep.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+case class CMSParams[K](
+ hashes: Seq[CMSHash[K]],
+ eps: Double,
+ delta: Double,
+ maxExactCountOpt: Option[Int] = None
+) {
+
+ require(0 < eps && eps < 1, "eps must lie in (0, 1)")
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ require(
+ hashes.size >= CMSFunctions.depth(delta),
+ s"we require at least ${CMSFunctions.depth(delta)} hash functions"
+ )
+
+}
+
+/**
+ * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]).
+ */
+object CMSFunctions {
+
+ /**
+ * Translates from `width` to `eps`.
+ */
+ def eps(width: Int): Double = scala.math.exp(1.0) / width
+
+ /**
+ * Translates from `depth` to `delta`.
+ */
+ @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta")
+ def delta(depth: Int): Double = {
+ val i = scala.math.exp(-depth)
+ require(
+ i > 0.0,
+ s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)"
+ )
+ i
+ }
+
+ /**
+ * Translates from `delta` to `depth`.
+ */
+ @throws[IllegalArgumentException]("if delta is is not in (0, 1)")
+ def depth(delta: Double): Int = {
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ scala.math.ceil(scala.math.log(1.0 / delta)).toInt
+ }
+
+ /**
+ * Translates from `eps` to `width`.
+ */
+ def width(eps: Double): Int =
+ scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt
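+
+ /* Illustrative sketch (not from the original source): the translations above
+ * in play, since depth = ceil(ln(1/delta)) and width = ceil(e / eps):
+ *
+ * {{{
+ * CMSFunctions.depth(0.01) // 5, ceil(ln 100)
+ * CMSFunctions.width(0.001) // 2719, ceil(e / 0.001)
+ * }}}
+ */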
+
+ /**
+ * Compute maxExactCount from parameters or `depth` and `width`
+ */
+ def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int =
+ maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50))
+
+ // Eliminates precision errors such as the following:
+ //
+ // scala> val width = 39
+ // scala> scala.math.exp(1) / CMSFunctions.eps(width)
+ // res171: Double = 39.00000000000001 <<< should be 39.0
+ //
+ // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal
+ // places should be 6.
+ private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) =
+ BigDecimal(i)
+ .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP)
+ .toDouble
+
+ /**
+ * Generates `N=depth` pair-wise independent hash functions.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * Error bound on the probability that a query estimate does NOT lie within some small interval around the
+ * truth.
+ * @param seed
+ * Seed for the random number generator.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @return
+ * The generated hash functions.
+ */
+ def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = {
+ // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form
+ //
+ // h_i(x) = a_i * x + b_i (mod p)
+ //
+ // But for this particular application, setting b_i does not matter (since all it does is shift the results of a
+ // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form
+ //
+ // h_i(x) = a_i * x (mod p)
+ //
+ val r = new scala.util.Random(seed)
+ val numHashes = depth(delta)
+ val numCounters = width(eps)
+ (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters))
+ }
+
+}
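+
+// A quick illustration of the translations above, shown as a hypothetical REPL
+// session (the values follow from the formulas; outputs may differ in the last
+// decimal places):
+//
+// scala> CMSFunctions.depth(1e-10)   // delta -> depth: ceil(ln(1 / 1e-10))
+// res0: Int = 24
+//
+// scala> CMSFunctions.width(0.001)   // eps -> width: ceil(e / 0.001)
+// res1: Int = 2719
+//
+// scala> CMSFunctions.eps(CMSFunctions.width(0.001)) <= 0.001
+// res2: Boolean = true               // round-tripping never loosens the bound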
+
+/**
+ * A trait for CMS implementations that can count elements in a data stream and that can answer point queries
+ * (i.e. frequency estimates) for these elements.
+ *
+ * Known implementations: [[CMS]], [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @tparam C
+ * The type of the actual CMS that implements this trait.
+ */
+trait CMSCounting[K, C[_]] {
+
+ /**
+ * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate.
+ */
+ def eps: Double
+
+ /**
+ * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an
+ * interval that depends on `eps`) around the truth.
+ */
+ def delta: Double
+
+ /**
+ * Number of hash functions (also: number of rows in the counting table). This number is derived from
+ * `delta`.
+ */
+ def depth: Int = CMSFunctions.depth(delta)
+
+ /**
+ * Number of counters per hash function (also: number of columns in the counting table). This number is
+ * derived from `eps`.
+ */
+ def width: Int = CMSFunctions.width(eps)
+
+ /**
+ * An optional bound on how many exact counts a sparse CMS keeps.
+ */
+ def maxExactCountOpt: Option[Int]
+
+ /**
+ * Maximum number of exact counts a sparse CMS keeps before it becomes dense. Derived from `maxExactCountOpt`.
+ */
+ def maxExactCount: Int =
+ CMSFunctions.maxExactCount(maxExactCountOpt, depth, width)
+
+ /**
+ * Returns a new sketch that is the combination of this sketch and the other sketch.
+ */
+ def ++(other: C[K]): C[K]
+
+ /**
+ * Counts the item and returns the result as a new sketch.
+ */
+ def +(item: K): C[K] = this + (item, 1L)
+
+ /**
+ * Counts the item `count` times and returns the result as a new sketch.
+ */
+ def +(item: K, count: Long): C[K]
+
+ /**
+ * Returns an estimate of the total number of times this item has been seen in the stream so far. This
+ * estimate is an upper bound.
+ *
+ * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also
+ * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`.
+ */
+ def frequency(item: K): Approximate[Long]
+
+ /**
+ * Returns an estimate of the inner product against another data stream.
+ *
+ * In other words, let a_i denote the number of times element i has been seen in the data stream summarized
+ * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of
+ * `<a, b> = \sum_i a_i b_i`.
+ *
+ * Note: This can also be viewed as the join size between two relations.
+ *
+ * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it
+ * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`.
+ */
+ def innerProduct(other: C[K]): Approximate[Long]
+
+ /**
+ * Total number of elements counted (i.e. seen in the data stream) so far.
+ */
+ def totalCount: Long
+
+ /**
+ * The first frequency moment is the total number of elements in the stream.
+ */
+ def f1: Long = totalCount
+
+ /**
+ * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element.
+ */
+ def f2: Approximate[Long]
+
+}
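+
+// A minimal usage sketch of this trait's API via [[CMS]] (assumes the implicit
+// CMSHasher[Long] that Algebird ships; values are illustrative):
+//
+// val cms = CMS.monoid[Long](eps = 0.001, delta = 1e-10, seed = 1).create(Seq(1L, 1L, 2L))
+// val f: Approximate[Long] = cms.frequency(1L)
+// // f.estimate >= 2 always holds (the error is one-sided), and with
+// // probability >= 1 - delta, f.estimate <= 2 + (eps * cms.totalCount).toLong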
+
+/**
+ * A trait for CMS implementations that can track heavy hitters in a data stream.
+ *
+ * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one
+ * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N"
+ * heavy hitters.
+ *
+ * Known implementations: [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+trait CMSHeavyHitters[K] {
+
+ /**
+ * The pluggable logic of how heavy hitters are being tracked.
+ */
+ def heavyHittersLogic: HeavyHittersLogic[K]
+
+ /**
+ * Returns the set of heavy hitters.
+ */
+ def heavyHitters: Set[K]
+
+}
+
+object CMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] =
+ monoid(eps, delta, seed, None)
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSMonoid[K] =
+ new CMSMonoid[K](eps, delta, seed, maxExactCountOpt)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] =
+ monoid(depth, width, seed, None)
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] =
+ aggregator(eps, delta, seed, None)
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] =
+ aggregator(depth, width, seed, None)
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ /**
+ * Returns a fresh, zeroed CMS instance.
+ */
+ def apply[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int] = None
+ ): CMS[K] = {
+ val params = {
+ val hashes: Seq[CMSHash[K]] =
+ CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+ CMSZero[K](params)
+ }
+
+}
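+
+// A sketch of the aggregator entry point (illustrative; uses Aggregator's
+// standard apply-over-a-collection):
+//
+// val agg = CMS.aggregator[Long](eps = 0.001, delta = 1e-10, seed = 1)
+// val cms: CMS[Long] = agg(Seq(1L, 1L, 2L)) // prepare each item, sum, present
+// // cms.frequency(1L).estimate is at least 2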
+
+/**
+ * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data
+ * stream.
+ *
+ * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]].
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val cmsMonoid: CMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     CMS.monoid[Long](eps, delta, seed)
+ *   }
+ *
+ *   // Creates a CMS instance that has counted the element `1L`.
+ *   val cms: CMS[Long] = cmsMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = cms.frequency(1L)
+ *   }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] {
+
+ override val eps: Double = params.eps
+
+ override val delta: Double = params.delta
+
+ override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+ override val totalCount: Long = 0L
+
+ override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+ override def ++(other: CMS[K]): CMS[K] = other
+
+ override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+ extends CMS[K](params) {
+
+ override def +(x: K, count: Long): CMS[K] =
+ SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] =>
+ CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+ case _ => other + item
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+ exactCountTable: Map[K, Long],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+ import SparseCMS._
+
+ override def +(x: K, count: Long): CMS[K] = {
+ val currentCount = exactCountTable.getOrElse(x, 0L)
+ val newTable = exactCountTable.updated(x, currentCount + count)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + count, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+ }
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + (other.item, other.totalCount)
+ case other: SparseCMS[K] =>
+ // This SparseCMS's maxExactCount is used, so ++ is not commutative
+ val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+
+ case other: CMSInstance[K] => other ++ this
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ exactCountTable.iterator
+ .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+ .reduceOption(_ + _)
+ .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+ /**
+ * Creates a new [[SparseCMS]] with empty exactCountTable
+ */
+ def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+ val exactCountTable = Map[K, Long]()
+ SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+ }
+
+ /**
+ * Creates a new [[CMSInstance]] from a Map[K, Long]
+ */
+ def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+ // Create a new CMSInstance
+ exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+}
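+
+// Sketch of the sparse-to-dense lifecycle (sizes illustrative): a fresh CMS
+// grows CMSZero -> CMSItem -> SparseCMS while it sees few distinct keys, and
+// SparseCMS folds itself into a dense CMSInstance via toDense once its exact
+// table reaches maxExactCount entries:
+//
+// val monoid = CMS.monoid[Long](eps = 0.01, delta = 1e-6, seed = 1)
+// val dense = (1L to 1000L).foldLeft(monoid.zero)(_ + _) // far past the threshold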
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
+ */
+case class CMSInstance[K](
+ countsTable: CMSInstance.CountsTable[K],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + other.item
+ case other: SparseCMS[K] =>
+ other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+ case other: CMSInstance[K] =>
+ val newTable = countsTable ++ other.countsTable
+ val newTotalCount = totalCount + other.totalCount
+ CMSInstance[K](newTable, newTotalCount, params)
+ }
+
+ private def makeApprox(est: Long): Approximate[Long] =
+ if (est == 0L) Approximate.exact(0L)
+ else {
+ val lower = math.max(0L, est - (eps * totalCount).toLong)
+ Approximate(lower, est, est, 1 - delta)
+ }
+
+ override def frequency(item: K): Approximate[Long] = {
+ var freq = Long.MaxValue
+ val hs = params.hashes
+ val it = countsTable.counts.iterator
+ var i = 0
+ while (it.hasNext) {
+ val row = it.next()
+ val count = row(hs(i)(item))
+ if (count < freq) freq = count
+ i += 1
+ }
+ makeApprox(freq)
+ }
+
+ /**
+ * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+ * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+ * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+ */
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ other match {
+ case other: CMSInstance[?] =>
+ require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+ def innerProductAtDepth(d: Int) =
+ (0 to (width - 1)).iterator.map { w =>
+ countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+ }.sum
+
+ val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+ val minimum =
+ math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+ Approximate(minimum, est, est, 1 - delta)
+ case _ => other.innerProduct(this)
+ }
+
+ override def +(item: K, count: Long): CMSInstance[K] = {
+ require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCountsTable =
+ (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+ val pos = (row, params.hashes(row)(item))
+ table + (pos, count)
+ }
+ CMSInstance[K](newCountsTable, totalCount + count, params)
+ } else this
+ }
+
+}
+
+object CMSInstance {
+
+ /**
+ * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+ */
+ def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+ val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+ CMSInstance[K](countsTable, 0, params)
+ }
+
+ /**
+ * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+ * hash function.
+ */
+ // TODO: implement a dense matrix type, and use it here
+ case class CountsTable[K](counts: Vector[Vector[Long]]) {
+ require(depth > 0, "Table must have at least 1 row.")
+ require(width > 0, "Table must have at least 1 column.")
+
+ def depth: Int = counts.size
+
+ def width: Int = counts(0).size
+
+ def getCount(pos: (Int, Int)): Long = {
+ val (row, col) = pos
+ require(row < depth && col < width, "Position must be within the bounds of this table.")
+ counts(row)(col)
+ }
+
+ /**
+ * Updates the count of a single cell in the table.
+ */
+ def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+ val (row, col) = pos
+ val currCount = getCount(pos)
+ val newCounts =
+ counts.updated(row, counts(row).updated(col, currCount + count))
+ CountsTable[K](newCounts)
+ }
+
+ /**
+ * Adds another counts table to this one, through element-wise addition.
+ */
+ def ++(other: CountsTable[K]): CountsTable[K] = {
+ require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+ val xss = this.counts.iterator
+ val yss = other.counts.iterator
+ val rows = Vector.newBuilder[Vector[Long]]
+ while (xss.hasNext) {
+ val xs = xss.next().iterator
+ val ys = yss.next().iterator
+ val row = Vector.newBuilder[Long]
+ while (xs.hasNext) row += (xs.next() + ys.next())
+ rows += row.result()
+ }
+ CountsTable[K](rows.result())
+ }
+ }
+
+ object CountsTable {
+
+ /**
+ * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+ */
+ def apply[K](depth: Int, width: Int): CountsTable[K] =
+ CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+ }
+
+}
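+
+// Worked micro-example of the dense update path (hypothetical hash values):
+// with depth=2 and width=4, adding (item, count=3) where the two row hashes
+// send the item to columns 1 and 2 increments exactly one cell per row:
+//
+//   row 0: [0, 3, 0, 0]
+//   row 1: [0, 0, 3, 0]
+//
+// frequency(item) then takes the minimum over those cells, so hash collisions
+// can only inflate an estimate, never deflate it (hence the one-sided bound).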
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+ extends java.io.Serializable
+ with CMSCounting[K, TopCMS]
+ with CMSHeavyHitters[K] {
+
+ override val eps: Double = cms.eps
+
+ override val delta: Double = cms.delta
+
+ override val totalCount: Long = cms.totalCount
+
+ override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+ override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+ override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+ cms.innerProduct(other.cms)
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+ /**
+ * The pluggable logic with which heavy hitters are being tracked.
+ */
+ override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set.empty[K]
+
+ override def +(item: K, count: Long): TopCMS[K] =
+ TopCMSInstance(cms, params) + (item, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set(item)
+
+ override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => toCMSInstance + other.item
+ case other: TopCMSInstance[K] => other + item
+ }
+
+ private def toCMSInstance: TopCMSInstance[K] = {
+ val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+}
+
+object TopCMSInstance {
+
+ def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+ TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override def heavyHitters: Set[K] = hhs.items
+
+ override def +(item: K, count: Long): TopCMSInstance[K] = {
+ require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCms = cms + (item, count)
+ val newHhs =
+ heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+ TopCMSInstance[K](newCms, newHhs, params)
+ } else this
+ }
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => this + other.item
+ case other: TopCMSInstance[K] =>
+ val newCms = cms ++ other.cms
+ val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+ TopCMSInstance(newCms, newHhs, params)
+ }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+ val params: TopCMSParams[K] = TopCMSParams(logic)
+
+ override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+ require(
+ left.cms.params.hashes == right.cms.params.hashes,
+ "The sketches must use the same hash functions."
+ )
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): TopCMS[K] =
+ TopCMSItem[K](item, emptyCms + item, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): TopCMS[K] =
+ data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) }
+
+ override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = {
+ val topCandidates = scala.collection.mutable.Set.empty[K]
+ val summation = new CMSSummation(emptyCms.params)
+ sketches.iterator.foreach { sketch =>
+ summation.updateInto(sketch.cms)
+ topCandidates ++= sketch.heavyHitters
+ }
+ val cms = summation.result
+ val ests =
+ topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet
+ val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+ override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+}
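+
+// A bulk-summation sketch (assumes a monoid built as in the example above;
+// names and values are illustrative):
+//
+// val m = TopPctCMS.monoid[Long](eps = 0.001, delta = 1e-10, seed = 1, heavyHittersPct = 0.1)
+// val merged = m.sum(Seq(m.create(1L), m.create(1L), m.create(2L)))
+// // sum re-estimates every heavy-hitter candidate against the summed CMS and
+// // purges those whose estimated count falls below the configured threshold.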
+
+class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] {
+
+ override def monoid: TopCMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): TopCMS[K] = monoid.create(value)
+
+ override def present(cms: TopCMS[K]): TopCMS[K] = cms
+
+}
+
+/**
+ * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters.
+ */
+abstract class HeavyHittersLogic[K] extends java.io.Serializable {
+
+ def updateHeavyHitters(
+ oldCms: CMS[K],
+ newCms: CMS[K]
+ )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = {
+ val oldItemCount = oldCms.frequency(item).estimate
+ val oldHh = HeavyHitter[K](item, oldItemCount)
+ val newItemCount = oldItemCount + count
+ val newHh = HeavyHitter[K](item, newItemCount)
+ purgeHeavyHitters(newCms)(hhs - oldHh + newHh)
+ }
+
+ def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = {
+ val candidates = (left.items ++ right.items).map { case i =>
+ HeavyHitter[K](i, cms.frequency(i).estimate)
+ }
+ val newHhs = HeavyHitters.from(candidates)
+ purgeHeavyHitters(cms)(newHhs)
+ }
+
+ def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K]
+
+}
+
+/**
+ * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)`
+ * times.
+ *
+ * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p
+ * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output.
+ *
+ * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked:
+ * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if
+ * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be
+ * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for
+ * tracking heavy hitters.
+ */
+case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] {
+
+ require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val minCount = heavyHittersPct * cms.totalCount
+ HeavyHitters[K](hitters.hhs.filter(_.count >= minCount))
+ }
+
+}
+
+/**
+ * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`.
+ *
+ * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias
+ * (with regard to heavy hitters) when merging instances. This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ * Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val sorted =
+ hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+ HeavyHitters[K](sorted.toSet)
+ }
+
+}
+
+/**
+ * A container for heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+ def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+ def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+ def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+ HeavyHitters[K](hhs ++ other.hhs)
+
+ def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+ def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+ private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+ def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+ hhs.foldLeft(empty[K])(_ + _)
+
+ def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ * A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ * times in the stream.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+ extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSMonoid[K] =
+ new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ * - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ * - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ * - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ * - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ * bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ * The maximum number of heavy hitters to track.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopNCMSAggregator[K] =
+ new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
+ */
+case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(
+ cms: CMS[(K1, K2)]
+ )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = {
+ val grouped = hitters.hhs.groupBy(hh => hh.item._1)
+ val (underLimit, overLimit) = grouped.partition {
+ _._2.size <= heavyHittersN
+ }
+ val sorted = overLimit.transform { case (_, hhs) =>
+ hhs.toSeq.sortBy(hh => hh.count)
+ }
+ val purged = sorted.transform { case (_, hhs) =>
+ hhs.takeRight(heavyHittersN)
+ }
+ HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet)
+ }
+
+}
+
+/*
+ * Monoid for Top-N values per key in an associative [[TopCMS]].
+ *
+ * Typical use case for this might be (Country, City) pairs. For a stream of such
+ * pairs, we might want to keep track of the most popular cities for each country.
+ *
+ * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this
+ * requires storing one CMS per distinct Country.
+ *
+ * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common
+ * countries may not make the cut if N is not "very large".
+ *
+ * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others
+ * out, while still only using a single CMS.
+ *
+ * In general the eviction of K1 is not supported, and all distinct K1 values must
+ * be retained. Therefore it is important to only use this Monoid when the number
+ * of distinct K1 values is known to be reasonably bounded.
+ */
+class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN))
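+
+// Sketch of the (Country, City) use case described above (hypothetical data;
+// assumes the implicit CMSHasher[String] that Algebird ships):
+//
+// val m = ScopedTopNCMS.monoid[String, String](eps = 0.001, delta = 1e-10, seed = 1, heavyHittersN = 2)
+// val cms = m.create(Seq("FR" -> "Paris", "FR" -> "Lyon", "FR" -> "Nice", "US" -> "NYC"))
+// // cms.heavyHitters keeps at most 2 cities per country, so "US" -> "NYC"
+// // survives even though it is rarer than every French city in the stream.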
+
+object ScopedTopNCMS {
+
+ def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] {
+ private val k1Hasher = implicitly[CMSHasher[K1]]
+ private val k2Hasher = implicitly[CMSHasher[K2]]
+
+ override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = {
+ val (k1, k2) = x
+ val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b)
+ (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width
+ }
+ }
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN)
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable {
+
+ /**
+ * Returns `a * x + b (mod p) (mod width)`.
+ */
+ def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x)
+
+}
+
+/**
+ * This object formerly held the instances that have since moved to the CMSHasher companion object.
+ *
+ * These instances are slow but are kept for compatibility with old serialized data. For new code, avoid them
+ * and instead use the implicits found in the CMSHasher companion object.
+ */
+object CMSHasherImplicits {
+
+ implicit object CMSHasherBigInt extends CMSHasher[BigInt] {
+ override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int =
+ CMSHasher.hashBytes(a, b, width)(x.toByteArray)
+ }
+
+ implicit object CMSHasherString extends CMSHasher[String] {
+ override def hash(a: Int, b: Int, width: Int)(x: String): Int =
+ CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8"))
+ }
+
+ def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort
+}
diff --git a/algebird-core/src/main/scala-2.13/DecayedVector.scala b/algebird-core/src/main/scala-2.13/DecayedVector.scala
new file mode 100644
index 000000000..18e816fe4
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/DecayedVector.scala
@@ -0,0 +1,75 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+/**
+ * Represents a container class together with time. Its monoid consists of exponentially scaling the older
+ * value and summing with the newer one.
+ */
+object DecayedVector extends CompatDecayedVector {
+ def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] =
+ DecayedVector(vector, time * scala.math.log(2.0) / halfLife)
+
+ def monoidWithEpsilon[C[_]](
+ eps: Double
+ )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] =
+ new Monoid[DecayedVector[C]] {
+ override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity)
+ override def plus(left: DecayedVector[C], right: DecayedVector[C]) =
+ if (left.scaledTime <= right.scaledTime) {
+ scaledPlus(right, left, eps)
+ } else {
+ scaledPlus(left, right, eps)
+ }
+ }
+
+ def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] =
+ DecayedVector[Map[K, _]](m, scaledTime)
+ def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] =
+ forMap(m, time * scala.math.log(2.0) / halfLife)
+
+ def mapMonoidWithEpsilon[K](
+ eps: Double
+ )(implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ monoidWithEpsilon[Map[K, _]](eps)
+
+ implicit def mapMonoid[K](implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ mapMonoidWithEpsilon(-1.0)
+
+ def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit
+ vs: VectorSpace[Double, C],
+ metric: Metric[C[Double]]
+ ): DecayedVector[C] = {
+ implicit val mon: Monoid[C[Double]] = vs.group
+ val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime)
+ val newVector =
+ Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector))
+ if (eps < 0.0 || Metric.norm(newVector) > eps) {
+ DecayedVector(newVector, newVal.scaledTime)
+ } else {
+ DecayedVector(mon.zero, Double.NegativeInfinity)
+ }
+ }
+}
+
+case class DecayedVector[C[_]](vector: C[Double], scaledTime: Double)
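+
+// A minimal usage sketch (assumes VectorSpace[Double, Map[String, _]] and
+// Metric[Map[String, Double]] instances are in implicit scope):
+//
+// val v1 = DecayedVector.forMapWithHalflife(Map("x" -> 1.0), time = 0.0, halfLife = 10.0)
+// val v2 = DecayedVector.forMapWithHalflife(Map("x" -> 1.0), time = 10.0, halfLife = 10.0)
+// val sum = DecayedVector.mapMonoid[String].plus(v1, v2)
+// // One half-life separates v1 and v2, so v1's contribution has decayed by
+// // 2^(-1) = 0.5 and sum.vector("x") is approximately 1.5 at v2's scaled time.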
diff --git a/algebird-core/src/main/scala-2.13/DecayingCMS.scala b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend depth of at least 5, and width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+ seed: Long,
+ val halfLife: Duration,
+ val depth: Int, // number of hashing functions
+ val width: Int, // number of table cells per hashing function
+ hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+ override def toString: String =
+ s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+ @inline private def getNextLogScale(
+ logScale: Double,
+ oldTimeInHL: Double,
+ nowInHL: Double
+ ): Double =
+ if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+ @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+ val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+ Math.exp(-logScale1)
+ }
+
+ val empty: CMS =
+ new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
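+
+ // A construction sketch following the recommendations in the class docs
+ // (names and numbers are illustrative; needs scala.concurrent.duration._):
+ //
+ // val module = new DecayingCMS[String](
+ //   seed = 0xfeedL, halfLife = 10.minutes, depth = 5, width = 300,
+ //   hasher = implicitly[CMSHasher[String]])
+ // val cms = module.empty.add(System.currentTimeMillis(), "key", 1.0)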
+
+ /**
+ * Represents a decaying scalar value at a particular point in time.
+ *
+ * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+ * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+ * equivalent if they are two points on the same curve.
+ *
+ * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+ * values do not produce the same (approximate) Double values from these methods, they represent different
+ * curves.
+ */
+ class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+ lhs =>
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map(f: Double => Double): DoubleAt =
+ new DoubleAt(f(value), timeInHL)
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt =
+ if (lhs.timeInHL < rhs.timeInHL) {
+ val x = lhs.scaledAt(rhs.timeInHL)
+ new DoubleAt(f(x, rhs.value), rhs.timeInHL)
+ } else if (lhs.timeInHL == rhs.timeInHL) {
+ new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL)
+ } else {
+ val y = rhs.scaledAt(lhs.timeInHL)
+ new DoubleAt(f(lhs.value, y), lhs.timeInHL)
+ }
+
+ def unary_- : DoubleAt = new DoubleAt(-value, timeInHL)
+ def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL)
+ def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL)
+
+ def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _)
+ def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _)
+ def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min)
+ def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max)
+
+ def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value
+
+ /**
+ * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent
+ * the same value at different points of decay.
+ */
+ def compare(rhs: DoubleAt): Int = {
+ val vc = cmp(lhs.value, rhs.value)
+ val tc = cmp(lhs.timeInHL, rhs.timeInHL)
+ if (vc == tc) vc
+ else if (tc == 0) vc
+ else if (vc == 0) tc
+ else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value)
+ else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL))
+ }
+
+ /**
+ * Time when this value will reach the smallest double value bigger than zero, unless we are already at
+ * zero in which case we return the current time
+ */
+ def timeToZero: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) timeInHL
+ else timeToUnit + DoubleAt.TimeFromUnitToZero
+
+ /**
+ * This is the scaled time when the current value will reach 1 (or -1 for negative values)
+ *
+ * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where
+ * its value would be 1, the unit value).
+ */
+ def timeToUnit: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) Double.NegativeInfinity
+ else {
+ // solve for result:
+ //
+ // 1 = value * module.getScale(0.0, timeInHL, result)
+ // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+ // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+ // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+ //
+ // log(value) = (result - timeInHL) * log2
+ // log(value) / log2 = result - timeInHL
+ // log(value) / log2 + timeInHL = result
+ Math.log(Math.abs(value)) / log2 + timeInHL
+ }
+
+ override def equals(that: Any): Boolean =
+ that match {
+ case d: DoubleAt => compare(d) == 0
+ case _ => false
+ }
+
+ override def hashCode: Int =
+ timeToUnit.##
+
+ override def toString: String =
+ s"DoubleAt($value, $timeInHL)"
+
+ def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+ def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+ def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+ def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+ def time: Long =
+ toTimestamp(timeInHL)
+
+ private def scaledAt(t: Double): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, t)
+
+ def at(time: Long): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+ }
+
+ object DoubleAt {
+ def apply(x: Double, t: Long): DoubleAt =
+ new DoubleAt(x, fromTimestamp(t))
+
+ val zero: DoubleAt =
+ new DoubleAt(0.0, Double.NegativeInfinity)
+
+ private val TimeFromUnitToZero: Double =
+ -Math.log(Double.MinPositiveValue) / log2
+ }
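+
+ // A small illustration of the decay semantics, for a module whose halfLife
+ // is 1 second (the values follow from the formulas above):
+ //
+ // val d = DoubleAt(8.0, 0L)
+ // d.at(1000L) // 4.0: one half-life later
+ // d.at(3000L) // 1.0: three half-lives later
+ // // Two DoubleAts on the same decay curve compare as equal even when they
+ // // were sampled at different times.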
+
+ val totalCells: Int = depth * width
+
+ val halfLifeSecs: Double =
+ halfLife.toMillis.toDouble / 1000.0
+
+ // TODO: consider a smaller number?
+ // we are trading accuracy for possible performance
+ private[this] val maxLogScale: Double = 20.0
+
+ /**
+ * Allocate an empty array of row.
+ *
+ * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+ * often building up cells mutably.
+ */
+ private def allocCells(): Array[Vector[Double]] =
+ new Array[Vector[Double]](depth)
+
+ def toTimestamp(t: Double): Long =
+ (t * halfLifeSecs * 1000.0).toLong
+
+ def fromTimestamp(t: Long): Double =
+ (t.toDouble / 1000.0) / halfLifeSecs
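+
+ // For example, with halfLife = 10.seconds: fromTimestamp(20000L) == 2.0
+ // (20 seconds is two half-lives) and toTimestamp(2.0) == 20000L.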
+
+ val hashFns: Array[K => Int] = {
+ val rng = new Random(seed)
+ def genPos(): Int =
+ rng.nextInt() match {
+ case 0 => genPos()
+ case n => n & 0x7fffffff
+ }
+
+ (0 until depth).map { _ =>
+ val n = genPos()
+ (k: K) => hasher.hash(n, 0, width)(k)
+ }.toArray
+ }
+
+ private final val log2 = Math.log(2.0)
+
+ /**
+ * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+ *
+ * delta = (t1 - t0) / halflife
+ *
+ * y1 = y0 * 2^(-delta) + n
+ *
+ * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+ * zero value should continue to have a zero value when n=0.
+ *
+ * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+ * the following formula:
+ *
+ * (1) zN = yN * scaleN
+ *
+ * Our constraint is expressed as:
+ *
+ * (2) If n=0, z1 = z0
+ *
+ * In that case:
+ *
+ * (3) If n=0, (y1 * scale1) = (y0 * scale0)
+ * (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+ * (5) 2^(-delta) * scale1 = scale0
+ * (6) scale1 = scale0 * 2^(delta)
+ *
+ * Also, to express z1 in terms of z0, we say:
+ *
+ * (7) z1 = y1 * scale1
+ * (8) z1 = (y0 * 2^(-delta) + n) * scale1
+ * (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+ * (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+ * (11) z1 / scale1 = z0 / scale1 + n
+ * (12) z1 = z0 + n * scale1
+ *
+ * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+ * in terms of z0 and scale1.
+ *
+ * If we convert scale to logscale, we have:
+ *
+ * (13) logscale1 = logscale0 + delta * log(2)
+ * (14) z1 = z0 + n * exp(logscale1)
+ *
+ * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+ * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+ * corresponding y) and set the logscale to 0.
+ *
+ * (15) y1 = z1 / scale1
+ * (16) y1 = z1 / exp(logscale1)
+ * (17) y1 = z1 * exp(-logscale1)
+ */
+ final class CMS(
+ val cells: Array[Vector[Double]],
+ val logScale: Double,
+ val timeInHL: Double
+ ) extends Serializable {
+
+ @inline private def scale: Double =
+ Math.exp(-logScale)
+
+ override def toString: String = {
+ val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+ s"CMS($s, $logScale, $timeInHL)"
+ }
+
+ override def hashCode: Int =
+ deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+ logScale.## * 17 +
+ timeInHL.## * 37 +
+ 19
+
+ // unfortunately we can't check the path-dependent type of this
+ // CMS, which we signal by using a type projection here.
+ override def equals(any: Any): Boolean =
+ any match {
+ case that: DecayingCMS[?]#CMS =>
+ this.logScale == that.logScale &&
+ this.timeInHL == that.timeInHL &&
+ this.cells.length == that.cells.length && {
+ var i = 0
+ while (i < depth) {
+ if (this.cells(i) != that.cells(i)) return false
+ i += 1
+ }
+ true
+ }
+ case _ =>
+ false
+ }
+
+ def lastUpdateTime: Long =
+ toTimestamp(timeInHL)
+
+ /**
+ * Provide lower and upper bounds on values returned for any possible key.
+ *
+ * The first value is a lower bound: even keys that have never been counted will return this value or
+ * greater. This will be zero unless the CMS is saturated.
+ *
+ * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+ * larger than this value (though it might be reported as being smaller).
+ *
+ * Together these values indicate how saturated and skewed the CMS might be.
+ */
+ def range: (DoubleAt, DoubleAt) = {
+ var minMinimum = Double.PositiveInfinity
+ var minMaximum = Double.PositiveInfinity
+ var i = 0
+ while (i < cells.length) {
+ val it = cells(i).iterator
+ var localMax = it.next() // we know it doesn't start empty
+ if (localMax < minMinimum) minMinimum = localMax
+ while (it.hasNext) {
+ val n = it.next()
+ if (n > localMax) localMax = n
+ else if (n < minMinimum) minMinimum = n
+ }
+ if (localMax < minMaximum) minMaximum = localMax
+ i += 1
+ }
+
+ val s = scale
+ def sc(x: Double): DoubleAt =
+ new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+ (sc(minMinimum), sc(minMaximum))
+ }
+
+ /**
+ * Returns the square-root of the inner product of two decaying CMSs.
+ *
+ * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+ * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+ * at() on a CMS):
+ *
+ * x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+ *
+ * This is why we don't support innerProduct, only innerProductRoot.
+ */
+ def innerProductRoot(that: CMS): DoubleAt = {
+ var i = 0
+ var res = Double.PositiveInfinity
+ val t = Math.max(this.timeInHL, that.timeInHL)
+ val scale = this.getScale(t) * that.getScale(t)
+ while (i < depth) {
+ var sum = 0.0
+ val it0 = this.cells(i).iterator
+ val it1 = that.cells(i).iterator
+ while (it0.hasNext) {
+ val x = it0.next() * it1.next()
+ if (x != 0.0) sum += x
+ }
+ if (sum < res) res = sum
+ i += 1
+ }
+ val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+ new DoubleAt(x, t)
+ }
+
+ def l2Norm: DoubleAt =
+ innerProductRoot(this)
+
+ def scale(x: Double): CMS =
+ if (java.lang.Double.isNaN(x)) {
+ throw new IllegalArgumentException(s"invalid scale: $x")
+ } else if (x < 0.0) {
+ throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+ } else if (x == 0.0) {
+ module.empty
+ } else {
+ val s = logScale + Math.log(x)
+ val c = new CMS(cells, s, timeInHL)
+ if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+ }
+
+ /**
+ * Get the total count of all items in the CMS.
+ *
+ * The total is the same as the l1Norm, since we don't allow negative values.
+ *
+ * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+ * exact (except for floating-point error).
+ */
+ def total: DoubleAt = {
+ val n = cells(0).sum
+ val x = if (n == 0.0) 0.0 else scale * n
+ new DoubleAt(x, timeInHL)
+ }
+
+ def get(k: K): DoubleAt = {
+ var minValue = Double.PositiveInfinity
+ var didx = 0
+ while (didx < depth) {
+ val i = hashFns(didx)(k)
+ val inner = cells(didx)
+ val value = inner(i)
+ if (value < minValue) minValue = value
+ didx += 1
+ }
+ val x = if (minValue == 0.0) 0.0 else scale * minValue
+ new DoubleAt(x, timeInHL)
+ }
+
+ def getScale(t: Double): Double =
+ module.getScale(logScale, timeInHL, t)
+
+ private final def nextLogScale(t: Double): Double =
+ module.getNextLogScale(logScale, timeInHL, t)
+
+ def +(other: CMS): CMS = {
+ val x = this
+ val y = other
+ val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+ val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+ val xscale = x.getScale(timeInHL)
+ val yscale = y.getScale(timeInHL)
+
+ // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double =
+ if (x == 0.0) 0.0 else x * y
+
+ var i = 0
+ while (i < depth) {
+ val left = x.cells(i)
+ val right = y.cells(i)
+ var j = 0
+ val bldr = rowBuilder()
+ while (j < width) {
+ bldr += prod(left(j), xscale) + prod(right(j), yscale)
+ j += 1
+ }
+ cms.cells(i) = bldr.result()
+ i += 1
+ }
+ cms
+ }
+
+ def add(t: Long, k: K, n: Double): CMS =
+ scaledAdd(fromTimestamp(t), k, n)
+
+ // TODO: we could allocate a mutable scratch pad, write all the
+ // values into it, and then build a CMS out of it. if items is
+ // very small, this would be less efficient than what we're doing
+ // now. probably the "ideal" solution would be determine how many
+ // items there are. if we have fewer than ~width items, this
+ // approach is fine. for more, a scratch pad would be better
+ // (assuming we wrote that code).
+ //
+ // alternately, you could map items into (zero + item) and then
+ // use the monoid's sum to boil it down.
+ //
+ // we only use this in testing currently so the current code is
+ // fine until we rely on it in production. any change here should
+ // probably include benchmarks justifying the design.
+ def bulkAdd(items: Iterable[(Long, K, Double)]): CMS =
+ items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) }
+
+ private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS =
+ if (n < 0.0) {
+ val t = toTimestamp(ts1)
+ throw new IllegalArgumentException(
+ s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t"
+ )
+ } else if (n == 0.0) {
+ this
+ } else {
+ val logScale1 = nextLogScale(ts1)
+ if (logScale1 > maxLogScale) {
+ rescaleTo(ts1).scaledAdd(ts1, k, n)
+ } else {
+ val increment = n * Math.exp(logScale1)
+ val cells1 = allocCells()
+ var didx = 0
+ while (didx < depth) {
+ val cell = cells(didx)
+ val w = hashFns(didx)(k)
+ cells1(didx) = cell.updated(w, cell(w) + increment)
+ didx += 1
+ }
+ new CMS(cells1, logScale1, ts1)
+ }
+ }
+
+ // Set the scale back to 0.0
+ // input time is in half-lives
+ private[algebird] def rescaleTo(ts: Double): CMS = {
+ val logScale1 = nextLogScale(ts)
+ val expL = Math.exp(-logScale1)
+ if (expL == 0.0) {
+ new CMS(monoid.zero.cells, 0.0, ts)
+ } else {
+ val cms = new CMS(allocCells(), 0.0, ts)
+ var i = 0
+ while (i < depth) {
+ val ci = cells(i)
+ cms.cells(i) = ci.map(_ * expL)
+ i += 1
+ }
+ cms
+ }
+ }
+ }
+
+ private def rowBuilder() = {
+ val bldr = Vector.newBuilder[Double]
+ bldr.sizeHint(width)
+ bldr
+ }
+
+ object CMS {
+
+ implicit val monoidForCMS: Monoid[CMS] =
+ new Monoid[CMS] {
+
+ def zero: CMS = module.empty
+
+ def plus(x: CMS, y: CMS): CMS =
+ x + y
+
+ /**
+ * Turn a flat array into an array of vectors.
+ */
+ private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = {
+ val cells = new Array[Vector[Double]](depth)
+ var i = 0
+ while (i < depth) {
+ var j = i * width
+ val limit = j + width
+ val bldr = rowBuilder()
+ while (j < limit) {
+ bldr += scratch(j)
+ j += 1
+ }
+ cells(i) = bldr.result()
+ i += 1
+ }
+ cells
+ }
+
+ /**
+ * This method sums the first `num` items in `arr`.
+ */
+ private def innerSum(arr: Array[CMS], num: Int): CMS =
+ if (num == 0) zero
+ else if (num == 1) arr(0)
+ else if (num == 2) plus(arr(0), arr(1))
+ else {
+ // start with zero
+ val scratch: Array[Double] = new Array(totalCells)
+
+ val latestTimeInHL: Double =
+ arr.iterator.take(num).map(cms => cms.timeInHL).max
+
+ var i = 0
+ while (i < num) {
+ val cms = arr(i)
+ val scale = cms.getScale(latestTimeInHL)
+ var j = 0
+ while (j < depth) {
+ val row = cms.cells(j)
+ val stride = j * width
+ var k = 0
+ while (k < width) {
+ val n = row(k)
+ if (n > 0.0) {
+ scratch(stride + k) += scale * n
+ }
+ k += 1
+ }
+ j += 1
+ }
+ i += 1
+ }
+
+ val cells = scratchToCells(scratch)
+
+ new CMS(cells, 0.0, latestTimeInHL)
+ }
+
+ override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = {
+
+ val it: Iterator[CMS] = xs.toIterator
+ val ChunkSize = 1000
+
+ // the idea here is that we read up to 1000 CMS values into
+ // a fixed array, crunch them down to a single CMS, store it
+ // in the first array index, read up to 999 more CMS values
+ // in, crunch them down, and so on.
+ var i = 0
+ val arr = new Array[CMS](ChunkSize)
+ while (it.hasNext) {
+ while (it.hasNext && i < ChunkSize) {
+ arr(i) = it.next()
+ i += 1
+ }
+ if (i > 1) {
+ arr(0) = innerSum(arr, i)
+ }
+ i = 1
+ }
+ if (i == 0) None else Some(arr(0))
+ }
+ }
+ }
+
+ val monoid: Monoid[CMS] = CMS.monoidForCMS
+}
+
+object DecayingCMS {
+
+ /**
+ * Construct a DecayingCMS module.
+ *
+ * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will
+ * always produce the same hash family.
+ *
+ * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by
+ * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to
+ * zero.
+ *
+ * The size of the CMS in bytes is O(depth * width).
+ *
+ * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use
+ * width=100, for 0.1% error, use width=1000, etc.
+ *
+   * Depth controls the probability that the error bounds are broken; that probability scales with
+   * exp(-alpha * depth), so a small depth (e.g. 5-10) is fine. Each update requires O(depth) work, so you
+   * want to keep this as small as possible.
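+   *
+   * A usage sketch (illustrative only; assumes a CMSHasher[String] is in scope and that halfLife is a
+   * java.time.Duration):
+   * {{{
+   * val module = DecayingCMS[String](seed = 0L, halfLife = Duration.ofHours(1), depth = 5, width = 100)
+   * val cms = module.empty.add(System.currentTimeMillis(), "key", 1.0)
+   * cms.get("key") // an approximate, decayed count for "key"
+   * }}}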
+ */
+ def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit
+ hasher: CMSHasher[K]
+ ): DecayingCMS[K] =
+ new DecayingCMS(seed, halfLife, depth, width, hasher)
+}
diff --git a/algebird-core/src/main/scala-2.13/Fold.scala b/algebird-core/src/main/scala-2.13/Fold.scala
new file mode 100644
index 000000000..0b89f2d62
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Fold.scala
@@ -0,0 +1,352 @@
+/*
+Copyright 2014 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.io.Serializable
+import scala.collection.compat._
+
+/**
+ * Folds are first-class representations of "Traversable.foldLeft." They have the nice property that they can
+ * be fused to work in parallel over an input sequence.
+ *
+ * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when
+ * done. We use existential types to hide internal details and to allow for internal and external (X and O)
+ * types to differ for "map" and "join."
+ *
+ * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a
+ * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the
+ * fold.
+ *
+ * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like
+ * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also
+ * expose some internal state so library authors can fold over their own types.
+ *
+ * See the companion object for constructors.
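+ *
+ * For example (an illustrative sketch computing a mean in one pass over the input):
+ * {{{
+ * val mean: Fold[Double, Option[Double]] =
+ *   Fold.sum[Double].joinWith(Fold.size) { (total, n) => if (n == 0L) None else Some(total / n) }
+ * mean.overTraversable(List(1.0, 2.0, 3.0)) // Some(2.0)
+ * }}}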
+ */
+sealed trait Fold[-I, +O] extends Serializable {
+
+ /**
+ * Users can ignore this type.
+ *
+ * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good
+ * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it
+ * provides.
+ */
+ type X
+
+ /**
+ * Users can ignore this method. It is exposed so library authors can run folds over their own sequence
+ * types.
+ *
+ * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the
+ * same Fold many times over different data structures, but we must build a new FoldState every time.
+ *
+ * See FoldState for information on how to use this for your own sequence types.
+ */
+ def build(): FoldState[X, I, O]
+
+ /**
+ * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or
+ * "Function1.compose."
+ */
+ def map[P](f: O => P): Fold[I, P] = {
+ val self = this
+ new Fold[I, P] {
+ type X = self.X
+ override def build(): FoldState[X, I, P] =
+ self.build().map(f)
+ }
+ }
+
+ /**
+ * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time
+ * and combines at the end.
+ */
+ def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = {
+ val self = this
+ new Fold[I2, Q] {
+ type X = (self.X, other.X)
+ override def build(): FoldState[X, I2, Q] = {
+ val first = self.build()
+ val second = other.build()
+ new FoldState(
+ { case ((x, y), i) => (first.add(x, i), second.add(y, i)) },
+ (first.start, second.start),
+ { case (x, y) => f(first.end(x), second.end(y)) }
+ )
+ }
+ }
+ }
+
+ /**
+ * Convenient shorthand for joining Folds without combining at the end.
+ */
+ def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] =
+ joinWith(other) { case (o, p) => (o, p) }
+
+ /**
+ * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.")
+ * This is analogous to "Function1.andThen."
+ */
+ def contramap[H](f: H => I): Fold[H, O] = {
+ val self = this
+ new Fold[H, O] {
+ type X = self.X
+ override def build(): FoldState[X, H, O] =
+ self.build().contramap(f)
+ }
+ }
+
+ /**
+ * Trivially runs a Fold over an empty sequence.
+ */
+ def overEmpty: O = {
+ // build is a "def" so we construct the state once and use the pieces to run the fold
+ val state = build()
+ state.end(state.start)
+ }
+
+ /**
+ * Trivially runs a Fold over a single element sequence.
+ */
+ def overSingleton(i: I): O = {
+ val state = build()
+ state.end(state.add(state.start, i))
+ }
+
+ /**
+ * Runs a Fold over a Traversable.
+ */
+ def overTraversable(is: TraversableOnce[I]): O = {
+ val state = build()
+ state.end(is.iterator.foldLeft(state.start)(state.add))
+ }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+ extends Serializable {
+
+ /**
+ * Transforms the output type of the FoldState (see Fold.map).
+ */
+ def map[P](f: O => P): FoldState[X, I, P] =
+ new FoldState(add, start, end.andThen(f))
+
+ /**
+ * Transforms the input type of the FoldState (see Fold.contramap).
+ */
+ def contramap[H](f: H => I): FoldState[X, H, O] =
+ new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+ /**
+ * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+ */
+ implicit def applicative[I]: Applicative[Fold[I, _]] =
+ new FoldApplicative[I]
+
+ /**
+ * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
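+   *
+   * For example (an illustrative sketch):
+   * {{{
+   * Fold.foldLeft[Int, Int](0)(_ + _).overTraversable(List(1, 2, 3)) // 6
+   * }}}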
+ */
+ def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+ fold[O, I, O](add, o, o => o)
+
+ /**
+ * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+ * immutable and serializable.
+ */
+ def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start, end)
+ }
+
+ /**
+ * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+ */
+ def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start(()), end)
+ }
+
+ /**
+ * Fuse a sequence of Folds into one that outputs the result of each.
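+   *
+   * For example (an illustrative sketch): {{{ Fold.sequence(Seq(Fold.first[Int], Fold.last[Int])) }}}
+   * runs both folds in a single pass over the input.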
+ */
+ def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+ new Fold[I, Seq[O]] {
+ type X = Seq[Any]
+ override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+ val bs: Seq[FoldState[Any, I, O]] =
+ ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+ val adds =
+ bs.map(_.add)
+ val ends =
+ bs.map(_.end)
+ val starts: Seq[Any] =
+ bs.map(_.start)
+ val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+ val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+ new FoldState(add, starts, end)
+ }
+ }
+
+ /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq]"; fewer type arguments,
+   * better type inference.
+ */
+ def seq[I]: Fold[I, Seq[I]] =
+ container[I, Seq]
+
+ /**
+   * A Fold that does no work and returns a constant. Analogous to Function1's const:
+   * def const[A, B](b: B): (A => B) = { _ => b }
+ */
+ def const[O](value: O): Fold[Any, O] =
+ Fold.foldLeft(value) { case (u, _) => u }
+
+ /**
+ * A Fold that runs the given side effect for every element.
+ */
+ def foreach[I](e: I => Unit): Fold[I, Unit] =
+ Fold.foldLeft(()) { case (_, i) => e(i) }
+
+ /**
+ * A Fold that returns the first value in a sequence.
+ */
+ def first[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the last value in a sequence.
+ */
+ def last[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+ /**
+ * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+ */
+ def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns a min value in a sequence. (Biased to earlier equal values.)
+ */
+ def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+ */
+ def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+ /**
+ * For a semigroup, if we get more than 0 items, use plus
+ */
+ def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+ Fold.foldLeft(None: Option[T]) {
+ case (None, i) => Some(i)
+ case (Some(l), r) => Some(sg.plus(l, r))
+ }
+
+ /**
+ * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+ */
+ def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+ /**
+ * A Fold that returns the length of a sequence.
+ */
+ def size: Fold[Any, Long] =
+ Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+ /**
+ * A Fold that returns "true" if all elements of the sequence statisfy the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(true)((b, i) => b && pred(i))
+
+ /**
+ * A Fold that returns "true" if any element of the sequence statisfies the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def exists[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(false)((b, i) => b || pred(i))
+
+ /**
+ * A Fold that counts the number of elements satisfying the predicate.
+ */
+ def count[I](pred: I => Boolean): Fold[I, Long] =
+ foldLeft(0L) {
+ case (c, i) if pred(i) => c + 1L
+ case (c, _) => c
+ }
+}
+
+/**
+ * Folds are Applicatives!
+ */
+class FoldApplicative[I] extends Applicative[Fold[I, _]] {
+ override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] =
+ mt.map(fn)
+ override def apply[T](v: T): Fold[I, T] =
+ Fold.const(v)
+ override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] =
+ mt.join(mu)
+ override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] =
+ Fold.sequence(ms)
+ override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] =
+ mt.joinWith(mu)(fn)
+}
diff --git a/algebird-core/src/main/scala-2.13/Interval.scala b/algebird-core/src/main/scala-2.13/Interval.scala
new file mode 100644
index 000000000..6a1645d16
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Interval.scala
@@ -0,0 +1,380 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird
+
+// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...)
+
+/**
+ * Represents a single interval on a T with an Ordering
+ */
+sealed trait Interval[T] extends java.io.Serializable {
+ def contains(t: T)(implicit ord: Ordering[T]): Boolean
+
+ def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T]
+ final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t)
+ final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that)
+
+ /**
+ * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the
+ * result is meaningless. TODO: It might be good to have types for these properties in algebird.
+ */
+ def mapNonDecreasing[U](fn: T => U): Interval[U]
+}
+
+case class Universe[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ that
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe()
+}
+
+case class Empty[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ this
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty()
+}
+
+object Interval extends java.io.Serializable {
+
+ /**
+ * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type
+ * information of the returned interval. The compiler doesn't know anything about ordering, so without
+ * [[MaybeEmpty]] the only valid return type is Interval[T].
+ */
+ sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] {
+ def isEmpty: Boolean
+ }
+ object MaybeEmpty {
+
+ /**
+ * Represents an empty interval.
+ */
+ case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = true
+ }
+
+ /**
+ * Represents a non-empty interval.
+ */
+ case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = false
+ }
+ }
+
+ type GenIntersection[T] = Intersection[Lower, Upper, T]
+ type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T]
+ type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T]
+ type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T]
+ type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T]
+
+ implicit def monoid[T: Ordering]: Monoid[Interval[T]] =
+ Monoid.from[Interval[T]](Universe[T]())(_ && _)
+
+ // Automatically convert from a MaybeEmpty instance
+ implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] =
+ me match {
+ case MaybeEmpty.SoEmpty() => Empty()
+ case MaybeEmpty.NotSoEmpty(i) => i
+ }
+
+ def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowExUp]()
+
+ def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowInUp]()
+
+ def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] =
+ if (Ordering[T].lteq(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowInUp]()
+
+ def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowExUp]()
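+
+  // A usage sketch (hypothetical values): each factory returns a MaybeEmpty, which converts
+  // implicitly to Interval via fromMaybeEmpty above.
+  //   val i: Interval[Int] = Interval.closed(0, 10)
+  //   i.contains(10)                           // true
+  //   (i && Interval.open(5, 20)).contains(5)  // false: 5 is excluded by the open lower bound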
+
+ /**
+ * This is here for binary compatibility reasons. These methods should be moved to Interval, which should
+ * also be an abstract class for better binary compatibility at the next incompatible change
+ */
+ implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+ def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+ case Empty() => true
+ case Universe() => false
+ case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.ordering.lt(l, u)
+ case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.ordering.lteq(l, u)
+ case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lt(_, u))
+ case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lteq(_, u))
+ case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+ case ExclusiveLower(l) =>
+ succ.next(l).isEmpty
+ case ExclusiveUpper(u) =>
+ pred.prev(u).isEmpty
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s less than t such that intr.contains(s)
+ *
+     * If this returns None, it may be Empty, Upper, or Universe.
+ */
+ def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Upper[?] => None
+ case i @ Intersection(_, _) => i.least
+ case l: Lower[?] => l.least
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+ * intr.contains(s)
+ *
+     * If this returns None, it may be Empty, Lower, or Universe.
+ */
+ def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+ intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Lower[?] => None
+ case i @ Intersection(_, _) => i.greatest
+ case u: Upper[?] => u.greatest
+ }
+ }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+ /**
+ * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+ * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+ * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+ * notion, which we don't have a typeclass for.
+ */
+ def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+ /**
+   * The smallest value that is contained here. This is an Option, because of cases like
+   * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty.
+ */
+ def least(implicit s: Successible[T]): Option[T]
+ def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Lower[T] from lowest to highest
+ */
+ def toIterable(implicit s: Successible[T]): Iterable[T] =
+ least match {
+ case Some(l) => s.iterateNext(l)
+ case None => Iterable.empty
+ }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+ /**
+   * The largest value that is contained here. This is an Option, because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+ */
+ def greatest(implicit p: Predecessible[T]): Option[T]
+ // The smallest value that is not present
+ def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Upper[T] from highest to lowest
+ */
+ def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+ greatest match {
+ case Some(g) => p.iteratePrev(g)
+ case None => Iterable.empty
+ }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+ case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+ }
+ override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lt(lower, upper)
+ case ExclusiveUpper(upper) =>
+ ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+ }
+ override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = Some(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower))
+}
+case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper))
+}
+case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper))
+}
+
+case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ lower.contains(t) && upper.contains(t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) => (lb && lower) && upper
+ case lb @ ExclusiveLower(_) => (lb && lower) && upper
+ case ub @ InclusiveUpper(_) => lower && (ub && upper)
+ case ub @ ExclusiveUpper(_) => lower && (ub && upper)
+ case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU)
+ }
+ override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = {
+ val newLower = lower match {
+ case InclusiveLower(l) => InclusiveLower(fn(l))
+ case ExclusiveLower(l) => ExclusiveLower(fn(l))
+ }
+ val newUpper = upper match {
+ case InclusiveUpper(u) => InclusiveUpper(fn(u))
+ case ExclusiveUpper(u) => ExclusiveUpper(fn(u))
+ }
+ Intersection(newLower, newUpper)
+ }
+
+ def least(implicit s: Successible[T]): Option[T] =
+ lower.least.filter(upper.contains(_)(s.ordering))
+
+ /**
+ * Goes from lowest to highest for all items that are contained in this Intersection
+ */
+ def leastToGreatest(implicit s: Successible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = s.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+ }
+ }
+
+ def greatest(implicit p: Predecessible[T]): Option[T] =
+ upper.greatest.filter(lower.contains(_)(p.ordering))
+
+ /**
+ * Goes from highest to lowest for all items that are contained in this Intersection
+ */
+ def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = p.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+ }
+ }
+
+ /**
+   * Some intervals can actually be synonyms for empty: (0,0) for instance, contains nothing. This cannot
+   * be normalized to [a, b) form, thus we return an Option. Also, there are cases like [Int.MinValue,
+   * Int.MaxValue] that are actually equivalent to Universe. The bottom line: if this returns None, it just
+   * means you can't express the interval in this form; it does not mean it is empty or Universe, etc.
+   * (there are other cases).
+ */
+ def toLeftClosedRightOpen(implicit
+ s: Successible[T]
+ ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+ for {
+ l <- lower.least
+ g <- upper.strictUpperBound if s.ordering.lt(l, g)
+ } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+ extends Semigroup[U] {
+ override def plus(l: U, r: U): U =
+ forward(semigroup.plus(reverse(l), reverse(r)))
+ override def sumOption(iter: TraversableOnce[U]): Option[U] =
+ semigroup.sumOption(iter.map(reverse)).map(forward)
+
+ /*
+ * Note these work for the subclasses since in those cases semigroup
+ * will be the appropriate algebra.
+ */
+ override val hashCode: Int = (forward, reverse, semigroup).hashCode
+ override def equals(that: Any): Boolean =
+ that match {
+ case r: InvariantSemigroup[?, ?] =>
+ (hashCode == r.hashCode) &&
+ (forward == r.forward) &&
+ (reverse == r.reverse) &&
+ (semigroup == r.semigroup)
+ case _ => false
+ }
+}
+
+class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T])
+ extends InvariantSemigroup[T, U](forward, reverse)
+ with Monoid[U] {
+ override val zero: U = forward(monoid.zero)
+}
+
+class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T])
+ extends InvariantMonoid[T, U](forward, reverse)
+ with Group[U] {
+ override def negate(u: U): U = forward(group.negate(reverse(u)))
+ override def minus(l: U, r: U): U =
+ forward(group.minus(reverse(l), reverse(r)))
+}
+
+class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T])
+ extends InvariantGroup[T, U](forward, reverse)
+ with Ring[U] {
+ override val one: U = forward(ring.one)
+ override def times(l: U, r: U): U =
+ forward(ring.times(reverse(l), reverse(r)))
+ override def product(iter: TraversableOnce[U]): U =
+ forward(ring.product(iter.map(reverse)))
+}
diff --git a/algebird-core/src/main/scala-2.13/JavaMonoids.scala b/algebird-core/src/main/scala-2.13/JavaMonoids.scala
new file mode 100644
index 000000000..26ce54f0a
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/JavaMonoids.scala
@@ -0,0 +1,147 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.lang.{
+ Boolean => JBool,
+ Double => JDouble,
+ Float => JFloat,
+ Integer => JInt,
+ Long => JLong,
+ Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+ override val zero: JInt = JInt.valueOf(0)
+ override val one: JInt = JInt.valueOf(1)
+ override def plus(x: JInt, y: JInt): JInt = x + y
+ override def negate(x: JInt): JInt = -x
+ override def minus(x: JInt, y: JInt): JInt = x - y
+ override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+ override val zero: JShort = Short.box(0)
+ override val one: JShort = Short.box(1)
+ override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+ override def negate(x: JShort): JShort = (-x).toShort
+ override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+ override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+ override val zero: JLong = JLong.valueOf(0L)
+ override val one: JLong = JLong.valueOf(1L)
+ override def plus(x: JLong, y: JLong): JLong = x + y
+ override def negate(x: JLong): JLong = -x
+ override def minus(x: JLong, y: JLong): JLong = x - y
+ override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+ override val zero: JFloat = JFloat.valueOf(0.0f)
+ override val one: JFloat = JFloat.valueOf(1.0f)
+ override def plus(x: JFloat, y: JFloat): JFloat = x + y
+ override def negate(x: JFloat): JFloat = -x
+ override def minus(x: JFloat, y: JFloat): JFloat = x - y
+ override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+ override val zero: JDouble = JDouble.valueOf(0.0)
+ override val one: JDouble = JDouble.valueOf(1.0)
+ override def plus(x: JDouble, y: JDouble): JDouble = x + y
+ override def negate(x: JDouble): JDouble = -x
+ override def minus(x: JDouble, y: JDouble): JDouble = x - y
+ override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+ override val zero: JBool = JBool.FALSE
+ override val one: JBool = JBool.TRUE
+ override def plus(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue ^ y.booleanValue)
+ override def negate(x: JBool): JBool = x
+ override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+ override def times(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since java Lists are mutable, this always makes a full copy. Prefer scala immutable Lists: if you use
+ * scala immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+ override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+ override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+ override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+ val res = new JArrayList[T](x.size + y.size)
+ res.addAll(x)
+ res.addAll(y)
+ res
+ }
+}
+
+/**
+ * Since java Maps are mutable, this always makes a full copy. Prefer scala immutable maps: if you use scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
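+ *
+ * A usage sketch (assuming the Ring[JInt] defined above as the value Semigroup):
+ * {{{
+ * val m = new JMapMonoid[String, JInt]()(JIntRing)
+ * // m.plus sums values for keys present in both maps and drops keys whose sum is zero
+ * }}}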
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+ override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+ val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: JMap[K, V]): Boolean =
+ !x.isEmpty && (implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] =>
+ x.values.asScala.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+ override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ val vsemi = implicitly[Semigroup[V]]
+ val result = new JHashMap[K, V](big.size + small.size)
+ result.putAll(big)
+ small.entrySet.asScala.foreach { kv =>
+ val smallK = kv.getKey
+ val smallV = kv.getValue
+ if (big.containsKey(smallK)) {
+ val bigV = big.get(smallK)
+ val newV =
+ if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV)
+ if (nonZero(newV))
+ result.put(smallK, newV)
+ else
+ result.remove(smallK)
+ } else {
+ // No need to explicitly add with zero on V, just put in the small value
+ result.put(smallK, smallV)
+ }
+ }
+ result
+ }
+}
diff --git a/algebird-core/src/main/scala-2.13/MapAlgebra.scala b/algebird-core/src/main/scala-2.13/MapAlgebra.scala
new file mode 100644
index 000000000..9ca370eaf
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/MapAlgebra.scala
@@ -0,0 +1,320 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import com.twitter.algebird.macros.{Cuber, Roller}
+import scala.collection.mutable.{Builder, Map => MMap}
+import scala.collection.{Map => ScMap}
+import algebra.ring.Rng
+import scala.collection.compat._
+
+trait MapOperations[K, V, M <: ScMap[K, V]] {
+ def add(oldMap: M, kv: (K, V)): M
+ def remove(oldMap: M, k: K): M
+ def fromMutable(mut: MMap[K, V]): M
+}
+
+abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V])
+ extends Monoid[M]
+ with MapOperations[K, V, M] {
+
+ val nonZero: (V => Boolean) = semigroup match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: M): Boolean =
+ !x.isEmpty && (semigroup match {
+ case mon: Monoid[?] =>
+ x.valuesIterator.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+
+ override def plus(x: M, y: M): M = {
+    // Scala maps can reuse internal structure, so don't copy, just add into the bigger one:
+ // This really saves computation when adding lots of small maps into big ones (common)
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small match {
+ // Mutable maps create new copies of the underlying data on add so don't use the
+ // handleImmutable method.
+ // Cannot have a None so 'get' is safe here.
+ case _: MMap[?, ?] => sumOption(Seq(big, small)).get
+ case _ => handleImmutable(big, small, bigOnLeft)
+ }
+ }
+
+ private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) =
+ small.foldLeft(big) { (oldMap, kv) =>
+ val newV = big
+ .get(kv._1)
+ .map { bigV =>
+ if (bigOnLeft)
+ semigroup.plus(bigV, kv._2)
+ else
+ semigroup.plus(kv._2, bigV)
+ }
+ .getOrElse(kv._2)
+ if (nonZero(newV))
+ add(oldMap, kv._1 -> newV)
+ else
+ remove(oldMap, kv._1)
+ }
+ override def sumOption(items: TraversableOnce[M]): Option[M] =
+ if (items.iterator.isEmpty) None
+ else {
+ val mutable = MMap[K, V]()
+ items.iterator.foreach { m =>
+ m.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val newV =
+ if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v)
+ if (nonZero(newV))
+ mutable.update(k, newV)
+ else
+ mutable.remove(k)
+ }
+ }
+ Some(fromMutable(mutable))
+ }
+}
+
+class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] {
+ override lazy val zero: Map[K, V] = Map[K, V]()
+ override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv
+ override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): Map[K, V] =
+ new MutableBackedMap(mut)
+}
+
+class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] {
+ override lazy val zero: ScMap[K, V] = ScMap[K, V]()
+ override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv
+ override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): ScMap[K, V] =
+ new MutableBackedMap(mut)
+}
+
+/**
+ * You can think of this as a Sparse vector group
+ */
+class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] {
+ override def negate(kv: Map[K, V]): Map[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+class ScMapGroup[K, V](implicit val group: Group[V])
+ extends ScMapMonoid[K, V]()(group)
+ with Group[ScMap[K, V]] {
+ override def negate(kv: ScMap[K, V]): ScMap[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+/**
+ * You can think of this as a Sparse vector ring
+ */
+trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] {
+
+ implicit def ring: Ring[V]
+
+ override def times(x: M, y: M): M = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small.foldLeft(zero) { (oldMap, kv) =>
+ val bigV = big.getOrElse(kv._1, ring.zero)
+ val newV =
+ if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV)
+ if (ring.isNonZero(newV)) {
+ add(oldMap, kv._1 -> newV)
+ } else {
+ remove(oldMap, kv._1)
+ }
+ }
+ }
+}
+
+class MapRing[K, V](implicit override val ring: Ring[V])
+ extends MapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, Map[K, V]]
+
+class ScMapRing[K, V](implicit override val ring: Ring[V])
+ extends ScMapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, ScMap[K, V]]
+
+object MapAlgebra {
+ def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean =
+ l.forall { case (k, v) =>
+ r.get(k).exists(Equiv[V].equiv(_, v))
+ }
+
+ implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] =
+ Equiv.fromFunction { (m1, m2) =>
+ val cleanM1 = removeZeros(m1)
+ val cleanM2 = removeZeros(m2)
+ rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, cleanM1)
+ }
+
+ def mergeLookup[T, U, V: Monoid](
+ keys: TraversableOnce[T]
+ )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+ sumByKey {
+ keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+ }
+
+ // Returns a new map with zero-value entries removed
+ def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+ m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+ /**
+ * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+   * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+ * equivalent to:
+ *
+ * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+ *
+ * Otherwise, the function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
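+   *
+   * For example (an illustrative sketch):
+   * {{{
+   * sumByKey(List("a" -> 1, "b" -> 2, "a" -> 3)) // Map("a" -> 4, "b" -> 2)
+   * }}}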
+ */
+ def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+ Monoid.sum(pairs.iterator.map(Map(_)))
+
+ /**
+ * For each key, creates a list of all values. This function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2))
+ */
+ def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+ if (pairs.iterator.isEmpty) Map.empty
+ else {
+ val mutable = MMap[K, Builder[V, List[V]]]()
+ pairs.iterator.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val bldr = if (oldVOpt.isEmpty) {
+ val b = List.newBuilder[V]
+ mutable.update(k, b)
+ b
+ } else oldVOpt.get
+ bldr += v
+ }
+ mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+ }
+
+  // Consider these as edges from k -> v; produce a Map[K, Set[V]]
+ def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+ Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+ /** join the keys of two maps (similar to outer-join in a DB) */
+ def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+ Monoid
+ .plus(
+ map1.transform { case (_, v) =>
+ (List(v), List[W]())
+ },
+ map2.transform { case (_, w) =>
+ (List[V](), List(w))
+ }
+ )
+ .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
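+
+  // For example (an illustrative sketch):
+  //   join(Map(1 -> "a"), Map(1 -> "b", 2 -> "c"))
+  //     == Map(1 -> (Some("a"), Some("b")), 2 -> (None, Some("c")))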
+
+ /**
+   * Reverses a graph losslessly. The None key is for v's with no sources.
+ */
+ def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+ def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+ if (i.isEmpty) Iterable(None)
+ else {
+ i.map(Some(_))
+ }
+
+ Monoid.sum {
+ for {
+ (k, sv) <- m.view.toIterable
+ v <- nonEmptyIter(sv)
+ } yield Map(v -> k.toSet)
+ }
+ }
+
+ /**
+   * Invert the common case of exactly one value for each key.
+ */
+ def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+ Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+ def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+ Monoid.sum(mring.times(left, right).values)
+
+ def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+ val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ c(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+ def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit c: Cuber[K]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+ def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+ val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ r(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+ def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit r: Roller[K]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-2.13/Scan.scala b/algebird-core/src/main/scala-2.13/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+ /**
+   * Most consumers of Scan don't care about the State type variable. But for those that do,
+ * we make an effort to expose it in all of our combinators.
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ */
+ type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+ implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+ def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+ }
+
+ def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+ override type State = Unit
+ override val initialState = ()
+ override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+ }
+
+ /**
+ * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+ * stream of outputs. Streams can be thought of as being a hidden state that is queryable for a head
+ * element, and another hidden state that represents the rest of the stream.
+ * @param initState
+ * The initial state of the scan; think of this as an infinite stream.
+ * @param destructor
+   *   This function decomposes a stream into its head element and tail stream.
+ * @tparam S
+ * The hidden state of the stream that we are turning into a Scan.
+ * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan
+ * @return
+ * A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+ * stream using the information provided to this method.
+ */
+ def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+ destructor(stateBeforeProcessingI)
+ }
+
+ /**
+ * A Scan whose `Nth` output is the number `N` (starting from 0).
+ */
+ val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+ def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+ /**
+ * @param initStateCreator
+ * A call-by-name method that allocates new mutable state
+ * @param presentAndUpdateStateFn
+ * A function that both presents the output value, and has the side-effect of updating the mutable state
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ * @return
+ * A Scan that safely encapsulates state while it's doing its thing.
+ */
+ def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override def initialState = initStateCreator
+ override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+ }
+
+ /**
+ * The trivial scan that always returns the same value, regardless of input
+ * @param t
+ * @tparam T
+ */
+ def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+ /**
+ * @param aggregator
+ * @param initState
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+ * aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+ from(initState) { (a: A, stateBeforeProcessingI: B) =>
+ // nb: the order of the arguments to semigroup.plus here is what determines the order of the final summation;
+ // this matters because not all semigroups are commutative
+ val stateAfterProcessingA =
+ aggregator.append(stateBeforeProcessingI, a)
+ (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * @param monoidAggregator
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+ * monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+ fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, where you only use one element of the
+ * pair within the `scanLeft`, and then throw that element away in a `map` immediately after the scanLeft
+ * is done, then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
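+ * For example (an illustrative sketch of a running sum):
+ * {{{
+ * val runningSum = Scan.from(0)((i: Int, sum: Int) => (sum + i, sum + i))
+ * runningSum(List(1, 2, 3)) // List(1, 3, 6)
+ * }}}
+ *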
+ * @tparam I
+ * The type of elements that the computation is scanning over.
+ * @tparam O
+ * The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+ import Scan.{from, Aux}
+
+ /**
+ * The computation of any given scan involves keeping track of a hidden state.
+ */
+ type State
+
+ /**
+ * The state of the scan before any elements have been processed
+ * @return
+ */
+ def initialState: State
+
+ /**
+ * @param i
+ * An element in the stream to process
+ * @param stateBeforeProcessingI
+ * The state of the scan before processing i
+ * @return
+ * The output of the scan corresponding to processing i with state stateBeforeProcessingI, along with the
+ * result of updating stateBeforeProcessingI with the information from i.
+ */
+ def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+ /**
+ * @param iter
+ * @return
+ * If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) =
+ * presentAndNextState(a_(i+1), state_i)` and `state_0 = initialState`.
+ */
+ def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+ override def hasNext: Boolean = iter.hasNext
+ var state: State = initialState
+ override def next(): O = {
+ val thisState = state
+ val thisA = iter.next()
+ val (thisC, nextState) = presentAndNextState(thisA, thisState)
+ state = nextState
+ thisC
+ }
+ }
+
+ /**
+ * @param inputs
+ * @param bf
+ * @tparam In
+ * The type of the input collection
+ * @tparam Out
+ * The type of the output collection
+ * @return
+ * Given inputs as a collection of the form `[a_1, ..., a_n]`, the output will be a collection of the form
+ * `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`.
+ */
+ def apply[In <: TraversableOnce[I], Out](
+ inputs: In
+ )(implicit bf: BuildFrom[In, O, Out]): Out =
+ bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+ // combinators
+
+ /**
+ * Return a new scan that is the same as this scan, but with a different `initialState`.
+ * @param newInitialState
+ * @return
+ */
+ def replaceState(newInitialState: => State): Aux[I, State, O] =
+ from(newInitialState)(presentAndNextState(_, _))
+
+ def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+ presentAndNextState(f(i), stateBeforeProcessingI)
+ }
+
+ def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ (g(c), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+ * pollute the `State` by pairing it redundantly with `Unit`.
+ * @tparam I1
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, then this results in a Scan whose `apply` method returns `[(o_1, a_1), ..., (o_n, a_n)]`
+ * when given the same input.
+ */
+ def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+ ((o, i), stateAfterProcessingI)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan before each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+ */
+ def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((stateBeforeProcessingI, o), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan after each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_1), ..., (o_n, state_n)]`.
+ */
+ def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((c, stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, return a scan whose apply method, when given the same input, will return `[(o_1, 0),
+ * ..., (o_n, n-1)]` (zero-based, matching `zipWithIndex`).
+ */
+ def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+ /**
+ * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+ * pairwise zipped outputs.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan whose
+ * apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]`, results in the output `[(o_1, p_1), ...,
+ * (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) == scan(foo).zip(scan2(bar))`
+ */
+ def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcessingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcessingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcessingI1, state2AfterProcessingI2))
+ }
+
+ /**
+ * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+ * on a common input stream.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan whose
+ * apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+ * scan(foo).zip(scan2(foo))`
+ */
+ def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcessingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcessingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcessingI1, state2AfterProcessingI2))
+ }
+
+ /**
+ * Takes the output of this scan and feeds it as input into scan2.
+ * @param scan2
+ * @tparam P
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a scan which
+ * returns `[p_1, ..., p_n]`.
+ */
+ def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcessingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcessingI, state2AfterProcessingO))
+ }
+
+}
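A sketch (editorial, not part of this diff) of how the combinators above behave, reusing the hypothetical `runningSum` scan from the earlier sketch:

```scala
val xs = List(1, 2, 3)

runningSum(xs)                     // List(1, 3, 6)
runningSum.joinWithInput(xs)       // List((1, 1), (3, 2), (6, 3))
runningSum.joinWithPriorState(xs)  // List((0, 1), (1, 3), (3, 6))
runningSum.compose(runningSum)(xs) // List(1, 4, 10): sums of the running sums
```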
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+ override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+ mt.andThenPresent(fn)
+
+ override def apply[T](v: T): Scan[I, T] =
+ Scan.const(v)
+
+ override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+ mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.13/SpaceSaver.scala b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+ * SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+ * public api to create a new SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+ SSMany(capacity, Map(item -> ((count, 0L))))
+
+ private[algebird] val ordering =
+ Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+ (-count, err)
+ }
+
+ implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+ new SpaceSaverSemigroup[T]
+
+ /**
+ * Encodes the SpaceSaver as a sequence of bytes containing in order
+ * - 1 byte: the sketch type (1 = SSOne, 2 = SSMany)
+ * - 4 bytes: the capacity
+ * - N bytes: the counters (their number, then for each one: item size, item bytes, and 2 counter longs)
+ */
+ def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+ ss match {
+ case SSOne(capacity, item) =>
+ val itemAsBytes = tSerializer(item)
+ val itemLength = itemAsBytes.length
+ // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+ val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+ ByteBuffer
+ .wrap(buffer)
+ .put(1: Byte)
+ .putInt(capacity)
+ .putInt(itemLength)
+ .put(itemAsBytes)
+ buffer
+
+ case SSMany(
+ capacity,
+ counters,
+ _
+          ) => // We do not care about the buckets, as they are recreated by SSMany.apply
+ val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+ buffer += (2: Byte)
+
+ var buff = ByteBuffer.allocate(4)
+ buff.putInt(capacity)
+ buffer ++= buff.array()
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(counters.size)
+ buffer ++= buff.array()
+ counters.foreach { case (item, (a, b)) =>
+ val itemAsBytes = tSerializer(item)
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(itemAsBytes.length)
+ buffer ++= buff.array()
+
+ buffer ++= itemAsBytes
+
+ buff = ByteBuffer.allocate(8 * 2)
+ buff.putLong(a)
+ buff.putLong(b)
+ buffer ++= buff.array()
+ }
+ buffer.result().toArray
+ }
+
+ // Make sure to be reversible so fromBytes(toBytes(x)) == x
+ def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+ fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+ def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+ Try {
+ bb.get.toInt match {
+ case 1 =>
+ val capacity = bb.getInt
+ val itemLength = bb.getInt
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+ case 2 =>
+ val capacity = bb.getInt
+
+ var countersToDeserialize = bb.getInt
+ val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+ while (countersToDeserialize != 0) {
+ val itemLength = bb.getInt()
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+ val a = bb.getLong
+ val b = bb.getLong
+
+ item match {
+ case Failure(e) => return Failure(e)
+ case Success(i) =>
+ counters += ((i, (a, b)))
+ }
+
+ countersToDeserialize -= 1
+ }
+
+ Success(SSMany(capacity, counters.toMap))
+ }
+ }.flatten
+}
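A round-trip sketch (editorial, not part of this diff) of the `toBytes`/`fromBytes` pair above, for `String` items:

```scala
import java.nio.charset.StandardCharsets.UTF_8
import scala.util.Try

val ss: SpaceSaver[String] =
  SpaceSaver(10, "a") ++ SpaceSaver(10, "b") ++ SpaceSaver(10, "a")

val bytes: Array[Byte] =
  SpaceSaver.toBytes(ss, (s: String) => s.getBytes(UTF_8))

// fromBytes(toBytes(x)) == x, as the comment above promises:
val roundTripped: Try[SpaceSaver[String]] =
  SpaceSaver.fromBytes(bytes, (b: Array[Byte]) => Try(new String(b, UTF_8)))
```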
+
+/**
+ * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+ import SpaceSaver.ordering
+
+ /**
+ * Maximum number of counters to keep (parameter "m" in the research paper).
+ */
+ def capacity: Int
+
+ /**
+ * Current lowest value for count
+ */
+ def min: Long
+
+ /**
+ * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+ * (error)
+ */
+ def counters: Map[T, (Long, Long)]
+
+ def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+ /**
+ * Returns the frequency estimate for the item.
+ */
+ def frequency(item: T): Approximate[Long] = {
+ val (count, err) = counters.getOrElse(item, (min, min))
+ Approximate(count - err, count, count, 1.0)
+ }
+
+ /**
+ * Get the elements that show up at least `thres` times. Returns sorted in descending order: (item,
+ * Approximate[Long], guaranteed)
+ */
+ def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+ counters.iterator
+ .filter { case (_, (count, _)) => count >= thres }
+ .toList
+ .sorted(ordering)
+ .map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+ }
+
+ /**
+ * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+ */
+ def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+ require(k < capacity)
+ val si = counters.toList
+ .sorted(ordering)
+ val siK = si.take(k)
+ val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+ siK.map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+ }
+ }
+
+ /**
+ * Check consistency with other SpaceSaver, useful for testing. Returns boolean indicating if they are
+ * consistent
+ */
+ def consistentWith(that: SpaceSaver[T]): Boolean =
+ (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0)
+}
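A usage sketch (editorial, not part of this diff) of the query methods above, merging per-item sketches with `++`:

```scala
val words = Seq("a", "b", "a", "c", "a", "b")

// One SSOne per observation, summed into a single sketch of capacity 5:
val sketch: SpaceSaver[String] = words.map(SpaceSaver(5, _)).reduce(_ ++ _)

sketch.topK(2).map(_._1)       // Seq("a", "b") -- the two heaviest items
sketch.frequency("a").estimate // 3L; exact here, since the capacity was never exceeded
```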
+
+case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] {
+ require(capacity > 1)
+
+ override def min: Long = 0L
+
+ override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L)))
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => SSMany(this).add(other)
+ case other: SSMany[?] => other.add(this)
+ }
+}
+
+object SSMany {
+ private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] =
+ SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap
+
+ private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] =
+ SSMany(capacity, counters, bucketsFromCounters(counters))
+
+ private[algebird] def apply[T](one: SSOne[T]): SSMany[T] =
+ SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item)))
+}
+
+case class SSMany[T] private (
+ override val capacity: Int,
+ override val counters: Map[T, (Long, Long)],
+ buckets: SortedMap[Long, Set[T]]
+) extends SpaceSaver[T] {
+ private val exact: Boolean = counters.size < capacity
+
+ override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey
+
+ // item is already present and just needs to be bumped up one
+ private def bump(item: T) = {
+ val (count, err) = counters(item)
+ val counters1 = counters + (item -> ((count + 1L, err))) // increment by one
+ val currBucket = buckets(count) // current bucket
+ val buckets1 = {
+ if (currBucket.size == 1) // delete current bucket since it will be empty
+ buckets - count
+ else // remove item from current bucket
+ buckets + (count -> (currBucket - item))
+ } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // lose one item to meet capacity constraint
+ private def loseOne = {
+ val firstBucket = buckets(buckets.firstKey)
+ val itemToLose = firstBucket.head
+ val counters1 = counters - itemToLose
+ val buckets1 =
+ if (firstBucket.size == 1)
+ buckets - min
+ else
+ buckets + (min -> (firstBucket - itemToLose))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // introduce new item
+ private def introduce(item: T, count: Long, err: Long) = {
+ val counters1 = counters + (item -> ((count, err)))
+ val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // add a single element
+ private[algebird] def add(x: SSOne[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ if (counters.contains(x.item))
+ bump(x.item)
+ else
+ (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min)
+ }
+
+ // merge two stream summaries
+ private def merge(x: SSMany[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ val counters1 = Map() ++
+ (counters.keySet ++ x.counters.keySet).toList
+ .map { key =>
+ val (count1, err1) = counters.getOrElse(key, (min, min))
+ val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min))
+ key -> ((count1 + count2, err1 + err2))
+ }
+ .sorted(SpaceSaver.ordering)
+ .take(capacity)
+ SSMany(capacity, counters1)
+ }
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => add(other)
+ case other: SSMany[?] => merge(other)
+ }
+}
+
+class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] {
+ override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y
+}
diff --git a/algebird-core/src/main/scala-2.13/VectorSpace.scala b/algebird-core/src/main/scala-2.13/VectorSpace.scala
new file mode 100644
index 000000000..f8818600c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/VectorSpace.scala
@@ -0,0 +1,59 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import scala.annotation.implicitNotFound
+
+/**
+ * This class represents a vector space. For the required properties see:
+ *
+ * http://en.wikipedia.org/wiki/Vector_space#Definition
+ */
+object VectorSpace extends VectorSpaceOps with Implicits
+
+sealed trait VectorSpaceOps {
+ def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] =
+ vs.scale(v, c)
+ def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] =
+ new VectorSpace[F, C] {
+ override def ring: Ring[F] = r
+ override def group: Group[C[F]] = cGroup
+ override def scale(v: F, c: C[F]): C[F] =
+ if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero
+ }
+}
+private object VectorSpaceOps extends VectorSpaceOps
+
+sealed trait Implicits extends LowPrioImplicits {
+ implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] =
+ VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _)))
+}
+
+sealed trait LowPrioImplicits {
+ implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] =
+ VectorSpaceOps.from[T, Map[K, _]] { (s, m) =>
+ m.transform { case (_, v) => Ring.times(s, v) }
+ }
+}
+
+@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}")
+trait VectorSpace[F, C[_]] extends java.io.Serializable {
+ implicit def ring: Ring[F]
+ def field: Ring[F] = ring // this is for compatibility with older versions
+ implicit def group: Group[C[F]]
+ def scale(v: F, c: C[F]): C[F]
+}
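A scaling sketch (editorial, not part of this diff), relying on the `indexedSeqSpace` instance above together with algebird's `Ring[Int]` and `Group[IndexedSeq[Int]]` instances:

```scala
val doubled = VectorSpace.scale(2, IndexedSeq(1, 2, 3)) // IndexedSeq(2, 4, 6)

// Scaling by zero short-circuits to the group's zero rather than mapping:
val zeroed = VectorSpace.scale(0, IndexedSeq(1, 2, 3))  // the empty IndexedSeq
```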
diff --git a/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala
new file mode 100644
index 000000000..b6d5e2ffc
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// Monad for either, used for modeling Error where L is the type of the error
+object EitherMonad {
+ class Error[L] extends Monad[Either[L, *]] {
+ override def apply[R](r: R): Right[L, R] = Right(r)
+
+ override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] =
+ self.right.flatMap(next)
+
+ override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] =
+ self.right.map(fn)
+ }
+
+ implicit def monad[L]: Monad[Either[L, _]] = new Error[L]
+
+ def assert[L](truth: Boolean, failure: => L): Either[L, Unit] =
+ if (truth) Right(()) else Left(failure)
+}
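A sketch (editorial, not part of this diff) of `assert` used for fail-fast validation; since `Either` is right-biased in Scala 2.13, the for-comprehension below needs no extra imports:

```scala
def validAge(age: Int): Either[String, Int] =
  for {
    _ <- EitherMonad.assert(age >= 0, "age must be non-negative")
    _ <- EitherMonad.assert(age < 150, "age is implausibly large")
  } yield age

// validAge(42) == Right(42)
// validAge(-1) == Left("age must be non-negative")
```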
diff --git a/algebird-core/src/main/scala-2.13/monad/Reader.scala b/algebird-core/src/main/scala-2.13/monad/Reader.scala
new file mode 100644
index 000000000..e0747af20
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/monad/Reader.scala
@@ -0,0 +1,76 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// TODO this is general, move somewhere better
+
+// Reader Monad, represents a series of operations that mutate some environment
+// type (the input to the function)
+
+sealed trait Reader[-Env, +T] {
+ def apply(env: Env): T
+ def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ FlatMappedReader[E1, T, U](this, next)
+ def map[U](thatFn: T => U): Reader[Env, U] =
+ FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t)))
+}
+
+final case class ConstantReader[+T](get: T) extends Reader[Any, T] {
+ override def apply(env: Any): T = get
+ override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get))
+ override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ next(get)
+}
+final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] {
+ override def apply(env: E): T = fn(env)
+}
+final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] {
+ override def apply(env: E): T = {
+ @annotation.tailrec
+ def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any =
+ r match {
+ case ConstantReader(get) =>
+ stack match {
+ case head :: tail => loop(head(get), tail)
+ case Nil => get
+ }
+ case ReaderFn(fn) =>
+ stack match {
+ case head :: tail => loop(head(fn(env)), tail)
+ case Nil => fn(env)
+ }
+ case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack)
+ }
+ loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T]
+ }
+}
+
+object Reader {
+ def const[T](t: T): Reader[Any, T] = ConstantReader(t)
+ implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn)
+
+ class ReaderM[Env] extends Monad[Reader[Env, _]] {
+ override def apply[T](t: T): ConstantReader[T] = ConstantReader(t)
+ override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] =
+ self.flatMap(next)
+ override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] = self.map(fn)
+ }
+
+ implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env]
+}
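A sketch (editorial, not part of this diff) of composing readers over a shared environment; `Config` and its fields are hypothetical names:

```scala
final case class Config(host: String, port: Int)

val host: Reader[Config, String] = Reader((c: Config) => c.host)
val port: Reader[Config, Int]    = Reader((c: Config) => c.port)

// flatMap/map thread the same environment through both readers:
val url: Reader[Config, String] =
  host.flatMap(h => port.map(p => s"http://$h:$p"))

// url(Config("localhost", 8080)) == "http://localhost:8080"
```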
diff --git a/algebird-core/src/main/scala-2.13/monad/StateWithError.scala b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala
new file mode 100644
index 000000000..e15a9ebc3
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala
@@ -0,0 +1,130 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.{Monad, Semigroup}
+
+/**
+ * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase
+ * with existing mutable APIs (like storm or cascading), but retain the ability to compose carefully.
+ */
+sealed trait StateWithError[S, +F, +T] {
+ def join[F1 >: F, U](
+ that: StateWithError[S, F1, U],
+ mergeErr: (F1, F1) => F1,
+ mergeState: (S, S) => S
+ ): StateWithError[S, F1, (T, U)] =
+ join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState))
+
+ def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit
+ sgf: Semigroup[F1],
+ sgs: Semigroup[S]
+ ): // TODO: deep joins could blow the stack, not yet using trampoline here
+ StateWithError[S, F1, (T, U)] =
+ StateFn { (requested: S) =>
+ (run(requested), that.run(requested)) match {
+ case (Right((s1, r1)), Right((s2, r2))) =>
+ Right((sgs.plus(s1, s2), (r1, r2)))
+ case (Left(err1), Left(err2)) =>
+          Left(sgf.plus(err1, err2)) // both sides failed: combine the errors
+ case (Left(err), _) => Left(err)
+ case (_, Left(err)) => Left(err)
+ }
+ }
+
+ def apply(state: S): Either[F, (S, T)] = run(state)
+
+ def run(state: S): Either[F, (S, T)]
+
+ def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] =
+ FlatMappedState(this, next)
+
+ def map[U](fn: (T) => U): StateWithError[S, F, U] =
+ FlatMappedState(this, (t: T) => StateWithError.const(fn(t)))
+}
+
+/** Simple wrapper of a function in the Monad */
+final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] {
+ override def run(state: S): Either[F, (S, T)] = fn(state)
+}
+
+/**
+ * A Trampolining instance that should prevent stack overflow at the expense of performance
+ */
+final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U])
+ extends StateWithError[S, F, U] {
+ override def run(state: S): Either[F, (S, U)] = {
+ @annotation.tailrec
+ def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any =
+ st match {
+ case StateFn(fn) =>
+ fn(inState) match {
+ case err @ Left(_) => err // bail at first error
+ case noError @ Right((newState, out)) =>
+ stack match {
+ case head :: tailStack => loop(newState, head(out), tailStack)
+ case Nil => noError // recursion ends
+ }
+ }
+ case FlatMappedState(st, next) => loop(inState, st, next :: stack)
+ }
+ loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]]
+ }
+}
+
+object StateWithError {
+ def getState[S]: StateWithError[S, Nothing, S] =
+ StateFn((state: S) => Right((state, state)))
+ def putState[S](newState: S): StateWithError[S, Nothing, Unit] =
+ StateFn((_: S) => Right((newState, ())))
+ def swapState[S](newState: S): StateWithError[S, Nothing, S] =
+ StateFn((old: S) => Right((newState, old)))
+
+ def const[S, T](t: T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def failure[S, F](f: F): StateWithError[S, F, Nothing] =
+ StateFn(_ => Left(f))
+
+ /**
+ * Use like fromEither[Int](Right("good")) to get a constant Either in the monad
+ */
+ def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S]
+ class ConstantStateMaker[S] {
+ def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) }
+ }
+
+ class FunctionLifter[S] {
+ def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) =>
+ StateFn((s: S) => fn(i).right.map((s, _)))
+ }
+ }
+ // TODO this should move to Monad and work for any Monad
+ def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S]
+
+ implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn)
+ implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S]
+
+ class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] {
+ override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) }
+ override def flatMap[T, U](
+ earlier: StateWithError[S, F, T]
+ )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] =
+ earlier.flatMap(next)
+ }
+}
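A sketch (editorial, not part of this diff) of threading state while failing fast; the counter example is hypothetical:

```scala
// Hand out increasing ids, failing if the counter has gone bad:
val nextId: StateWithError[Int, String, Int] =
  StateFn((s: Int) => if (s < 0) Left("negative counter") else Right((s + 1, s)))

val twoIds: StateWithError[Int, String, (Int, Int)] =
  nextId.flatMap(a => nextId.map(b => (a, b)))

// twoIds.run(0)  == Right((2, (0, 1)))
// twoIds.run(-1) == Left("negative counter")
```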
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala
similarity index 100%
rename from algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala
rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala
diff --git a/algebird-core/src/main/scala-3/Aggregator.scala b/algebird-core/src/main/scala-3/Aggregator.scala
new file mode 100644
index 000000000..8a4d2b230
--- /dev/null
+++ b/algebird-core/src/main/scala-3/Aggregator.scala
@@ -0,0 +1,637 @@
+package com.twitter.algebird
+
+import java.util.PriorityQueue
+import scala.collection.compat._
+import scala.collection.generic.CanBuildFrom
+
+/**
+ * Aggregators compose well.
+ *
+ * To create a parallel aggregator that operates on a single input in parallel, use:
+ * GeneratedTupleAggregator.from2((agg1, agg2))
+ */
+object Aggregator extends java.io.Serializable {
+ implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] =
+ new AggregatorApplicative[I]
+
+ private val DefaultSeed = 471312384
+
+ /**
+ * This is a trivial aggregator that always returns a single value
+ */
+ def const[T](t: T): MonoidAggregator[Any, Unit, T] =
+ prepareMonoid { (_: Any) => () }.andThenPresent(_ => t)
+
+ /**
+ * Using Aggregator.prepare,present you can add to this aggregator
+ */
+ def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] =
+ fromSemigroup(Semigroup.from(red))
+ def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] =
+ new Aggregator[T, T, T] {
+ override def prepare(input: T): T = input
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] =
+ prepareMonoid(identity[T])
+ // Uses the product from the ring
+ def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] =
+ fromRing[T, T](rng, identity[T])
+
+ def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] =
+ prepareMonoid(prep)(mon)
+
+ def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] =
+ new Aggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def semigroup: Semigroup[T] = sg
+ override def present(reduction: T): T = reduction
+ }
+ def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ new MonoidAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def monoid: Monoid[T] = m
+ override def present(reduction: T): T = reduction
+ }
+ // Uses the product from the ring
+ def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] =
+ new RingAggregator[F, T, T] {
+ override def prepare(input: F): T = prep(input)
+ override def ring: Ring[T] = rng
+ override def present(reduction: T): T = reduction
+ }
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to
+ * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}}
+ */
+ def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, T] =
+ appendSemigroup(prep, appnd, identity[T])(sg)
+
+ /**
+ * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Semigroup]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param prep
+ * The preparation function. Expected to construct an instance of type T from a single data element.
+ * @param appnd
+ * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator.
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param sg
+ * The [[Semigroup]] type class
+ * @note
+ * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}}
+ */
+ def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit
+ sg: Semigroup[T]
+ ): Aggregator[F, T, P] =
+ new Aggregator[F, T, P] {
+ override def semigroup: Semigroup[T] = sg
+ override def prepare(input: F): T = prep(input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P =
+ applyOption(inputs).get
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ agg(inputs).map(pres)
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ if (items.iterator.isEmpty) old else reduce(old, agg(items).get)
+
+ private def agg(inputs: TraversableOnce[F]): Option[T] =
+ if (inputs.iterator.isEmpty) None
+ else {
+ val itr = inputs.iterator
+          val t = prepare(itr.next())
+ Some(itr.foldLeft(t)(appnd))
+ }
+ }
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent
+ * to {{{appendMonoid(appnd, identity[T]_)(m)}}}
+ */
+ def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
+ appendMonoid(appnd, identity[T])(m)
+
+ /**
+ * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation
+ * @tparam F
+ * Data input type
+ * @tparam T
+ * Aggregating [[Monoid]] type
+ * @tparam P
+ * Presentation (output) type
+ * @param appnd
+ * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this
+ * aggregator. Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
+ * @param pres
+ * The presentation function
+ * @param m
+ * The [[Monoid]] type class
+ * @note
+ * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}}
+ */
+ def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit
+ m: Monoid[T]
+ ): MonoidAggregator[F, T, P] =
+ new MonoidAggregator[F, T, P] {
+ override def monoid: Monoid[T] = m
+ override def prepare(input: F): T = appnd(m.zero, input)
+ override def present(reduction: T): P = pres(reduction)
+
+ override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs))
+
+ override def applyOption(inputs: TraversableOnce[F]): Option[P] =
+ if (inputs.isEmpty) None else Some(apply(inputs))
+
+ override def append(l: T, r: F): T = appnd(l, r)
+
+ override def appendAll(old: T, items: TraversableOnce[F]): T =
+ reduce(old, agg(items))
+
+ override def appendAll(items: TraversableOnce[F]): T = agg(items)
+
+ private def agg(inputs: TraversableOnce[F]): T =
+ inputs.foldLeft(m.zero)(append)
+ }
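A sketch (editorial, not part of this diff) of `appendMonoid`: summing string lengths by appending each element directly onto the running `Long`, instead of preparing a `Long` per element and re-summing:

```scala
val totalLength: MonoidAggregator[String, Long, Long] =
  Aggregator.appendMonoid((sum: Long, s: String) => sum + s.length)

// totalLength(List("ab", "c")) == 3L
// The law above holds: appnd(t, f) == Monoid.plus(t, appnd(0L, f))
```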
+
+ /**
+ * How many items satisfy a predicate
+ */
+ def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] =
+ prepareMonoid { (t: T) => if (pred(t)) 1L else 0L }
+
+ /**
+ * Do any items satisfy some predicate
+ */
+ def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(OrVal.unboxedMonoid)
+
+ /**
+ * Do all items satisfy a predicate
+ */
+ def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
+ prepareMonoid(pred)(AndVal.unboxedMonoid)
+
+ /**
+ * Take the first (left most in reduce order) item found
+ */
+ def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l)
+
+ /**
+ * Take the last (right most in reduce order) item found
+ */
+ def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r)
+
+ /**
+ * Get the maximum item
+ */
+ def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T]
+ def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ max[U]
+ }
+
+ /**
+ * Get the minimum item
+ */
+ def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T]
+ def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
+ implicit val ordU: Ordering[U] = Ordering.by(fn)
+ min[U]
+ }
+
+ /**
+ * This returns the number of items we find
+ */
+ def size: MonoidAggregator[Any, Long, Long] =
+ prepareMonoid((_: Any) => 1L)
+
+ /**
+ * Take the smallest `count` items using a heap
+ */
+ def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)
+
+ /**
+ * Same as sortedTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).take(count).
+ */
+ def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedTake(count)(Ordering.by(fn))
+
+ /**
+ * Take the largest `count` items using a heap
+ */
+ def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Same as sortedReverseTake, but using a function that returns a value that has an Ordering.
+ *
+ * This function is like writing list.sortBy(fn).reverse.take(count).
+ */
+ def sortByReverseTake[T, U: Ordering](
+ count: Int
+ )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
+ Aggregator.sortedReverseTake(count)(Ordering.by(fn))
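A sketch (editorial, not part of this diff) of the heap-backed take variants above:

```scala
val smallest3 = Aggregator.sortedTake[Int](3)
// smallest3(List(5, 1, 4, 2, 3)) == Seq(1, 2, 3)

val longest2 = Aggregator.sortByReverseTake[String, Int](2)(_.length)
// longest2(List("a", "abc", "ab")) == Seq("abc", "ab")
```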
+
+ /**
+ * Immutable version of sortedTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)
+
+ /**
+ * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions.
+ */
+ def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
+ new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse)
+
+ /**
+ * Randomly selects input items where each item has an independent probability 'prob' of being selected.
+ * This assumes that all sampled records can fit in memory, so use this only when the expected number of
+ * sampled values is small.
+ */
+ def randomSample[T](
+ prob: Double,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, Option[Batched[T]], List[T]] = {
+ assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]")
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .filter(_ => rng.nextDouble() <= prob)
+ .monoidAggregate(toList)
+ }
+
+ /**
+ * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than
+ * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only
+ * for small values of 'count'.
+ */
+ def reservoirSample[T](
+ count: Int,
+ seed: Int = DefaultSeed
+ ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
+ val rng = new java.util.Random(seed)
+ Preparer[T]
+ .map(rng.nextDouble() -> _)
+ .monoidAggregate(sortByTake(count)(_._1))
+ .andThenPresent(_.map(_._2))
+ }
+
+ /**
+ * Put everything in a List. Note, this could fill the memory if the List is very large.
+ */
+ def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] =
+ new MonoidAggregator[T, Option[Batched[T]], List[T]] {
+ override def prepare(t: T): Option[Batched[T]] = Some(Batched(t))
+ override def monoid: Monoid[Option[Batched[T]]] =
+ Monoid.optionMonoid(Batched.semigroup)
+ override def present(o: Option[Batched[T]]): List[T] =
+ o.map(_.toList).getOrElse(Nil)
+ }
+
+ /**
+ * Put everything in a Set. Note, this could fill the memory if the Set is very large.
+ */
+ def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] =
+ prepareMonoid { (t: T) => Set(t) }
+
+ /**
+ * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the
+ * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an
+ * approximate version of this that is scalable.
+ */
+ def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] =
+ toSet[T].andThenPresent(_.size)
+
+ /**
+ * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set
+ * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error, which uses at most 8192 bytes for
+ * each HLL. For more control, see HyperLogLogAggregator.
+ */
+ def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] =
+ SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)
+
+ /**
+ * Returns the lower bound of a given percentile, where the percentile is between (0, 1]. The items that are
+ * iterated over cannot be negative.
+ */
+ def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregatorLowerBound[T] =
+ QTreeAggregatorLowerBound[T](percentile, k)
+
+ /**
+ * Returns the intersection of a bounded percentile, where the percentile is between (0, 1]. The items that are
+ * iterated over cannot be negative.
+ */
+ def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit
+ num: Numeric[T]
+ ): QTreeAggregator[T] =
+ QTreeAggregator[T](percentile, k)
+
+ /**
+ * An aggregator that sums Numeric values into Doubles.
+ *
+ * This is really no more than converting to Double and then summing. The conversion to double means we
+ * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue).
+ *
+ * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you
+ * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T]
+ * after importing the numericRing implicit:
+ *
+ * {{{
+ * import com.twitter.algebird.Ring.numericRing
+ * def numericAggregator[T: Numeric]: MonoidAggregator[T, T, T] = Aggregator.fromMonoid[T]
+ * }}}
+ */
+ def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] =
+ Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid)
+
+}
+
+/**
+ * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup,
+ * then finally we present the results.
+ *
+ * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators
+ * are useful in parallel map/reduce systems where there may be some additional types needed to cross the
+ * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle
+ * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag:
+ * Aggregator[T, _, Int]): Int)
+ *
+ * Note, join is very useful to combine multiple aggregations with one pass. Also
+ * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well.
+ *
+ * This type is the Fold.M from Haskell's folds package:
+ * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html
+ */
+trait Aggregator[-A, B, +C] extends java.io.Serializable { self =>
+ def prepare(input: A): B
+ def semigroup: Semigroup[B]
+ def present(reduction: B): C
+
+ /* *****
+ * All the following are in terms of the above
+ */
+
+ /**
+ * combine two inner values
+ */
+ def reduce(l: B, r: B): B = semigroup.plus(l, r)
+
+ /**
+ * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is
+ * non-empty
+ */
+ def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get
+
+ /**
+ * This is the safe version of the above. If the input is empty, return None, else reduce the items.
+ */
+ def reduceOption(items: TraversableOnce[B]): Option[B] =
+ semigroup.sumOption(items)
+
+ /**
+ * This may error if inputs are empty (for Monoid Aggregators it never will; instead you see
+ * `present(Monoid.zero[B])`).
+ */
+ def apply(inputs: TraversableOnce[A]): C =
+ present(reduce(inputs.iterator.map(prepare)))
+
+ /**
+ * This returns None if the inputs are empty
+ */
+ def applyOption(inputs: TraversableOnce[A]): Option[C] =
+ reduceOption(inputs.iterator.map(prepare))
+ .map(present)
+
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def cumulativeIterator(inputs: Iterator[A]): Iterator[C] =
+ inputs
+ .scanLeft(None: Option[B]) {
+ case (None, a) => Some(prepare(a))
+ case (Some(b), a) => Some(append(b, a))
+ }
+ .collect { case Some(b) => present(b) }
+
+ /**
+ * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result
+ * will be empty too.
+ */
+ def applyCumulatively[In <: TraversableOnce[A], Out](
+ inputs: In
+ )(implicit bf: CanBuildFrom[In, C, Out]): Out =
+ (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator))
+
+ def append(l: B, r: A): B = reduce(l, prepare(r))
+
+ def appendAll(old: B, items: TraversableOnce[A]): B =
+ if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare)))
+
+ /** Like calling andThen on the present function */
+ def andThenPresent[D](present2: C => D): Aggregator[A, B, D] =
+ new Aggregator[A, B, D] {
+ override def prepare(input: A): B = self.prepare(input)
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): D = present2(self.present(reduction))
+ }
+
+ /** Like calling compose on the prepare function */
+ def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] =
+ new Aggregator[A1, B, C] {
+ override def prepare(input: A1): B = self.prepare(prepare2(input))
+ override def semigroup: Semigroup[B] = self.semigroup
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to run two aggregators on the same data with a single pass
+ */
+ def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] =
+ GeneratedTupleAggregator.from2((this, that))
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = this
+ new Aggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
+
+ /**
+ * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained, so only do
+ * this if you require joining a Fold with an Aggregator to produce a Fold.
+ */
+ def toFold: Fold[A, Option[C]] =
+ Fold.fold[Option[B], A, Option[C]](
+ {
+ case (None, a) => Some(self.prepare(a))
+ case (Some(b), a) => Some(self.append(b, a))
+ },
+ None,
+ _.map(self.present)
+ )
+
+ def lift: MonoidAggregator[A, Option[B], Option[C]] =
+ new MonoidAggregator[A, Option[B], Option[C]] {
+ override def prepare(input: A): Option[B] = Some(self.prepare(input))
+ override def present(reduction: Option[B]): Option[C] = reduction.map(self.present)
+ override def monoid = new OptionMonoid[B]()(self.semigroup)
+ }
+}
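A single-pass sketch (editorial, not part of this diff) of `join`, which the scaladoc above highlights for combining multiple aggregations:

```scala
// Min and max computed in one traversal of the input:
val minAndMax: Aggregator[Int, (Int, Int), (Int, Int)] =
  Aggregator.min[Int].join(Aggregator.max[Int])

// minAndMax(List(3, 1, 4)) == (1, 4)
```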
+
+/**
+ * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the
+ * middle type use join on the trait, or GeneratedTupleAggregator.fromN
+ */
+class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] {
+ override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] =
+ mt.andThenPresent(fn)
+ override def apply[T](v: T): Aggregator[I, ?, T] =
+ Aggregator.const(v)
+ override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] =
+ mt.join(mu)
+ override def join[T1, T2, T3](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3]
+ ): Aggregator[I, ?, (T1, T2, T3)] =
+ GeneratedTupleAggregator.from3((m1, m2, m3))
+
+ override def join[T1, T2, T3, T4](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4]
+ ): Aggregator[I, ?, (T1, T2, T3, T4)] =
+ GeneratedTupleAggregator.from4((m1, m2, m3, m4))
+
+ override def join[T1, T2, T3, T4, T5](
+ m1: Aggregator[I, ?, T1],
+ m2: Aggregator[I, ?, T2],
+ m3: Aggregator[I, ?, T3],
+ m4: Aggregator[I, ?, T4],
+ m5: Aggregator[I, ?, T5]
+ ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] =
+ GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5))
+}
+
+trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self =>
+ def monoid: Monoid[B]
+ override def semigroup: Monoid[B] = monoid
+ final override def reduce(items: TraversableOnce[B]): B =
+ monoid.sum(items)
+
+ def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare))
+
+ override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = {
+ val self = this
+ new MonoidAggregator[A, B, D] {
+ override def prepare(a: A): B = self.prepare(a)
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): D = present2(self.present(b))
+ }
+ }
+ override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = {
+ val self = this
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B = self.prepare(prepare2(a))
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+ }
+
+ /**
+ * Build a MonoidAggregator that either takes left or right input and outputs the pair from both
+ */
+ def either[A2, B2, C2](
+ that: MonoidAggregator[A2, B2, C2]
+ ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] =
+ new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] {
+ override def prepare(e: Either[A, A2]): (B, B2) = e match {
+ case Left(a) => (self.prepare(a), that.monoid.zero)
+ case Right(a2) => (self.monoid.zero, that.prepare(a2))
+ }
+ override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid)
+ override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2))
+ }
+
+ /**
+ * Only transform values where the function is defined, else discard
+ */
+ def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] =
+ new MonoidAggregator[A2, B, C] {
+ override def prepare(a: A2): B =
+ if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * Only aggregate items that match a predicate
+ */
+ def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] =
+ new MonoidAggregator[A1, B, C] {
+ override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero
+ override def monoid: Monoid[B] = self.monoid
+ override def present(b: B): C = self.present(b)
+ }
+
+ /**
+ * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator
+ */
+ def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] =
+ new MonoidAggregator[TraversableOnce[A], B, C] {
+ override def monoid: Monoid[B] = self.monoid
+ override def prepare(input: TraversableOnce[A]): B =
+ monoid.sum(input.iterator.map(self.prepare))
+ override def present(reduction: B): C = self.present(reduction)
+ }
+
+ /**
+ * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to
+ * chain .composePrepare onto the result if you have an initial input that has to be prepared differently
+ * for each of the joined aggregators.
+ *
+ * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
+ */
+ def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = {
+ val ag1 = self
+ new MonoidAggregator[(A, A2), (B, B2), (C, C2)] {
+ override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2))
+ override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid)
+ override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2))
+ }
+ }
+}
+
+trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] {
+ def ring: Ring[B]
+ override def monoid: Monoid[B] = Ring.asTimesMonoid(ring)
+}
diff --git a/algebird-core/src/main/scala-3/CountMinSketch.scala b/algebird-core/src/main/scala-3/CountMinSketch.scala
new file mode 100644
index 000000000..a526b2a51
--- /dev/null
+++ b/algebird-core/src/main/scala-3/CountMinSketch.scala
@@ -0,0 +1,1418 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import algebra.CommutativeMonoid
+
+import scala.collection.compat._
+
+/**
+ * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear
+ * space.
+ *
+ * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error
+ * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`.
+ *
+ * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively.
+ *
+ * Then:
+ *
+ * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`.
+ * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes.
+ * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] +=
+ * 1`, for each `1 <= i <= d`.
+ * - (Note the rough similarity to a Bloom filter.)
+ *
+ * As an example application, suppose you want to estimate the number of times an element `x` has appeared in
+ * a data stream so far. The Count-Min sketch estimate of this frequency is
+ *
+ * min_i { counts[i, h_i[x]] }
+ *
+ * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true
+ * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far.
+ *
+ * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the
+ * estimates and error bounds used in this implementation.
+ *
+ * Parts of this implementation are taken from
+ * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
+ *
+ * @author
+ * Edwin Chen
+ */
+/**
+ * Monoid for adding CMS sketches.
+ *
+ * =Usage=
+ *
+ * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in
+ * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are
+ * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor
+ * depending on eps."
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param seed
+ * A seed to initialize the random number generator used to create the pairwise independent hash functions.
+ * @param maxExactCountOpt
+ *   An optional limit on how many exact counts a sparse CMS keeps before it switches to a dense representation.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be
+ * imported. Which type K should you pick in practice? For domains that have less than `2^64` unique
+ * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other
+ * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire),
+ *   though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]).
+ */
+class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None)
+ extends Monoid[CMS[K]]
+ with CommutativeMonoid[CMS[K]] {
+
+ val params: CMSParams[K] = {
+ val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+
+ override val zero: CMS[K] = CMSZero[K](params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: CMS[K], right: CMS[K]): CMS[K] = {
+ require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.")
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): CMS[K] = CMSItem[K](item, 1L, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ data.foreach(k => summation.insert(k, 1L))
+ summation.result
+ }
+
+ override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+
+ override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = {
+ val summation = new CMSSummation(params)
+ summation.updateAll(sketches)
+ summation.result
+ }
+}
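+
+// A minimal usage sketch (parameter values are illustrative), relying on the
+// CMSHasher[Long] instance that ships with Algebird:
+//
+//   val monoid = new CMSMonoid[Long](eps = 0.001, delta = 1e-10, seed = 1)
+//   val cms = monoid.plus(monoid.create(1L), monoid.create(Seq(1L, 2L, 2L)))
+//   // cms.frequency(2L).estimate is an upper bound on the true count of 2L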
+
+/**
+ * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability
+ * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without
+ * letting a reference to the instance escape into a closure.
+ */
+class CMSSummation[K](params: CMSParams[K]) {
+ private[this] val hashes = params.hashes.toArray
+ private[this] val height = CMSFunctions.depth(params.delta)
+ private[this] val width = CMSFunctions.width(params.eps)
+ private[this] val cells = new Array[Long](height * width)
+ private[this] var totalCount = 0L
+
+ final def insert(k: K, count: Long): Unit = {
+ var row = 0
+ var offset = 0
+ val hs = hashes
+ while (row < hs.length) {
+ cells(offset + hs(row)(k)) += count
+ offset += width
+ row += 1
+ }
+ totalCount += count
+ }
+
+ def updateAll(sketches: TraversableOnce[CMS[K]]): Unit =
+ sketches.iterator.foreach(updateInto)
+
+ def updateInto(cms: CMS[K]): Unit =
+ cms match {
+ case CMSZero(_) =>
+ ()
+ case CMSItem(item, count, _) =>
+ insert(item, count)
+ case SparseCMS(table, _, _) =>
+ table.foreach { case (item, c) =>
+ insert(item, c)
+ }
+ case CMSInstance(CMSInstance.CountsTable(matrix), count, _) =>
+ var offset = 0
+ val rit = matrix.iterator
+ while (rit.hasNext) {
+ var col = 0
+ val cit = rit.next().iterator
+ while (cit.hasNext) {
+ cells(offset + col) += cit.next()
+ col += 1
+ }
+ offset += width
+ }
+ totalCount += count
+ }
+
+ def result: CMS[K] =
+ if (totalCount == 0L) CMSZero(params)
+ else {
+ def vectorize(row: Int): Vector[Long] = {
+ val offset = row * width
+ val b = Vector.newBuilder[Long]
+ var col = 0
+ while (col < width) {
+ b += cells(offset + col)
+ col += 1
+ }
+ b.result()
+ }
+
+ val b = Vector.newBuilder[Vector[Long]]
+ var row = 0
+ while (row < height) {
+ b += vectorize(row)
+ row += 1
+ }
+ CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params)
+ }
+}
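+
+// A minimal sketch of the intended usage pattern described above: the mutable
+// summation stays local to one method and only the immutable result escapes.
+//
+//   def sumCounts[K](params: CMSParams[K], items: Iterable[(K, Long)]): CMS[K] = {
+//     val summation = new CMSSummation(params)
+//     items.foreach { case (k, c) => summation.insert(k, c) }
+//     summation.result
+//   }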
+
+/**
+ * An Aggregator for [[CMS]]. Can be created using CMS.aggregator.
+ */
+case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] {
+ override val monoid: CMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): CMS[K] = monoid.create(value)
+
+ override def present(cms: CMS[K]): CMS[K] = cms
+
+}
+
+/**
+ * Configuration parameters for [[CMS]].
+ *
+ * @param hashes
+ *   Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from
+ * `delta`).
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * A bound on the probability that a query estimate does not lie within some small interval (an interval
+ * that depends on `eps`) around the truth.
+ * @param maxExactCountOpt
+ *   An optional limit on how many exact counts a sparse CMS keeps before it switches to a dense representation.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+case class CMSParams[K](
+ hashes: Seq[CMSHash[K]],
+ eps: Double,
+ delta: Double,
+ maxExactCountOpt: Option[Int] = None
+) {
+
+ require(0 < eps && eps < 1, "eps must lie in (0, 1)")
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ require(
+ hashes.size >= CMSFunctions.depth(delta),
+ s"we require at least ${CMSFunctions.depth(delta)} hash functions"
+ )
+
+}
+
+/**
+ * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]).
+ */
+object CMSFunctions {
+
+ /**
+ * Translates from `width` to `eps`.
+ */
+ def eps(width: Int): Double = scala.math.exp(1.0) / width
+
+ /**
+ * Translates from `depth` to `delta`.
+ */
+ @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta")
+ def delta(depth: Int): Double = {
+ val i = scala.math.exp(-depth)
+ require(
+ i > 0.0,
+ s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)"
+ )
+ i
+ }
+
+ /**
+ * Translates from `delta` to `depth`.
+ */
+  @throws[IllegalArgumentException]("if delta is not in (0, 1)")
+ def depth(delta: Double): Int = {
+ require(0 < delta && delta < 1, "delta must lie in (0, 1)")
+ scala.math.ceil(scala.math.log(1.0 / delta)).toInt
+ }
+
+ /**
+ * Translates from `eps` to `width`.
+ */
+ def width(eps: Double): Int =
+ scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt
+
+ /**
+ * Compute maxExactCount from parameters or `depth` and `width`
+ */
+ def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int =
+ maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50))
+
+ // Eliminates precision errors such as the following:
+ //
+ // scala> val width = 39
+ // scala> scala.math.exp(1) / CMSFunctions.eps(width)
+ // res171: Double = 39.00000000000001 <<< should be 39.0
+ //
+ // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal
+ // places should be 6.
+ private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) =
+ BigDecimal(i)
+ .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP)
+ .toDouble
+
+ /**
+ * Generates `N=depth` pair-wise independent hash functions.
+ *
+ * @param eps
+ * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta
+ * Error bound on the probability that a query estimate does NOT lie within some small interval around the
+ * truth.
+ * @param seed
+ * Seed for the random number generator.
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @return
+ * The generated hash functions.
+ */
+ def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = {
+ // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form
+ //
+ // h_i(x) = a_i * x + b_i (mod p)
+ //
+ // But for this particular application, setting b_i does not matter (since all it does is shift the results of a
+ // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form
+ //
+ // h_i(x) = a_i * x (mod p)
+ //
+ val r = new scala.util.Random(seed)
+ val numHashes = depth(delta)
+ val numCounters = width(eps)
+ (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters))
+ }
+
+}
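+
+// A worked example of the translations above, for eps = 0.001 and delta = 1e-10
+// (integer division, as in maxExactCount itself):
+//
+//   CMSFunctions.width(0.001)                  // ceil(e / 0.001)          == 2719
+//   CMSFunctions.depth(1e-10)                  // ceil(ln(1 / 1e-10))      == 24
+//   CMSFunctions.maxExactCount(None, 24, 2719) // max(2719 * 24 / 100, 50) == 652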
+
+/**
+ * A trait for CMS implementations that can count elements in a data stream and that can answer point queries
+ * (i.e. frequency estimates) for these elements.
+ *
+ * Known implementations: [[CMS]], [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ * @tparam C
+ * The type of the actual CMS that implements this trait.
+ */
+trait CMSCounting[K, C[_]] {
+
+ /**
+ * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate.
+ */
+ def eps: Double
+
+ /**
+ * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an
+ * interval that depends on `eps`) around the truth.
+ */
+ def delta: Double
+
+ /**
+ * Number of hash functions (also: number of rows in the counting table). This number is derived from
+ * `delta`.
+ */
+ def depth: Int = CMSFunctions.depth(delta)
+
+ /**
+ * Number of counters per hash function (also: number of columns in the counting table). This number is
+ * derived from `eps`.
+ */
+ def width: Int = CMSFunctions.width(eps)
+
+ /**
+ * An Option parameter about how many exact counts a sparse CMS wants to keep
+   * An optional limit on how many exact counts a sparse CMS keeps before it switches to a dense representation.
+ def maxExactCountOpt: Option[Int]
+
+ /**
+ * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`.
+ */
+ def maxExactCount: Int =
+ CMSFunctions.maxExactCount(maxExactCountOpt, depth, width)
+
+ /**
+ * Returns a new sketch that is the combination of this sketch and the other sketch.
+ */
+ def ++(other: C[K]): C[K]
+
+ /**
+ * Counts the item and returns the result as a new sketch.
+ */
+ def +(item: K): C[K] = this + (item, 1L)
+
+ /**
+ * Counts the item `count` times and returns the result as a new sketch.
+ */
+ def +(item: K, count: Long): C[K]
+
+ /**
+ * Returns an estimate of the total number of times this item has been seen in the stream so far. This
+ * estimate is an upper bound.
+ *
+ * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also
+ * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`.
+ */
+ def frequency(item: K): Approximate[Long]
+
+ /**
+ * Returns an estimate of the inner product against another data stream.
+ *
+ * In other words, let a_i denote the number of times element i has been seen in the data stream summarized
+ * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of `<a, b> =
+ * \sum a_i b_i`.
+ *
+ * Note: This can also be viewed as the join size between two relations.
+ *
+ * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it
+ * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`.
+ */
+ def innerProduct(other: C[K]): Approximate[Long]
+
+ /**
+ * Total number of elements counted (i.e. seen in the data stream) so far.
+ */
+ def totalCount: Long
+
+ /**
+ * The first frequency moment is the total number of elements in the stream.
+ */
+ def f1: Long = totalCount
+
+ /**
+ * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element.
+ */
+ def f2: Approximate[Long]
+
+}
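+
+// A worked instance of the frequency guarantee above: with eps = 0.001 and a
+// stream of N = 1,000,000 elements, frequency(x) never undercounts and, with
+// probability >= 1 - delta, overcounts by at most eps * N = 1000.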
+
+/**
+ * A trait for CMS implementations that can track heavy hitters in a data stream.
+ *
+ * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one
+ * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N"
+ * heavy hitters.
+ *
+ * Known implementations: [[TopCMS]].
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+trait CMSHeavyHitters[K] {
+
+ /**
+ * The pluggable logic of how heavy hitters are being tracked.
+ */
+ def heavyHittersLogic: HeavyHittersLogic[K]
+
+ /**
+ * Returns the set of heavy hitters.
+ */
+ def heavyHitters: Set[K]
+
+}
+
+object CMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] =
+ monoid(eps, delta, seed, None)
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSMonoid[K] =
+ new CMSMonoid[K](eps, delta, seed, maxExactCountOpt)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] =
+ monoid(depth, width, seed, None)
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] =
+ aggregator(eps, delta, seed, None)
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] =
+ aggregator(depth, width, seed, None)
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ maxExactCountOpt: Option[Int]
+ ): CMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+ /**
+ * Returns a fresh, zeroed CMS instance.
+ */
+ def apply[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ maxExactCountOpt: Option[Int] = None
+ ): CMS[K] = {
+ val params = {
+ val hashes: Seq[CMSHash[K]] =
+ CMSFunctions.generateHashes(eps, delta, seed)
+ CMSParams(hashes, eps, delta, maxExactCountOpt)
+ }
+ CMSZero[K](params)
+ }
+
+}
+
+/**
+ * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data
+ * stream.
+ *
+ * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]].
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ * // Creates a monoid for a CMS that can count `Long` elements.
+ * val cmsMonoid: CMSMonoid[Long] = {
+ *   val eps = 0.001
+ *   val delta = 1E-10
+ *   val seed = 1
+ *   CMS.monoid[Long](eps, delta, seed)
+ * }
+ *
+ * // Creates a CMS instance that has counted the element `1L`.
+ * val cms: CMS[Long] = cmsMonoid.create(1L)
+ *
+ * // Estimates the frequency of `1L`
+ * val estimate: Approximate[Long] = cms.frequency(1L)
+ *   }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] {
+
+ override val eps: Double = params.eps
+
+ override val delta: Double = params.delta
+
+ override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+ override val totalCount: Long = 0L
+
+ override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+ override def ++(other: CMS[K]): CMS[K] = other
+
+ override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+ extends CMS[K](params) {
+
+ override def +(x: K, count: Long): CMS[K] =
+ SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] =>
+ CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+ case _ => other + item
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+ exactCountTable: Map[K, Long],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+ import SparseCMS._
+
+ override def +(x: K, count: Long): CMS[K] = {
+ val currentCount = exactCountTable.getOrElse(x, 0L)
+ val newTable = exactCountTable.updated(x, currentCount + count)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + count, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+ }
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + (other.item, other.totalCount)
+ case other: SparseCMS[K] =>
+      // This SparseCMS's maxExactCount is used, so ++ is not commutative
+ val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+ if (newTable.size < maxExactCount) {
+ // still sparse
+ SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+ } else {
+ toDense(newTable, params)
+ }
+
+ case other: CMSInstance[K] => other ++ this
+ }
+
+ override def frequency(x: K): Approximate[Long] =
+ Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ exactCountTable.iterator
+ .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+ .reduceOption(_ + _)
+ .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+ /**
+ * Creates a new [[SparseCMS]] with empty exactCountTable
+ */
+ def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+ val exactCountTable = Map[K, Long]()
+ SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+ }
+
+ /**
+ * Creates a new [[CMSInstance]] from a Map[K, Long]
+ */
+ def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+ exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+}
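+
+// A minimal sketch of the sparse-to-dense transition (a `params: CMSParams[Long]`
+// value is assumed in scope; the 652 figure reuses the worked defaults above):
+//
+//   val sparse = SparseCMS[Long](params) // empty exact-count table
+//   val updated = sparse + (1L, 5L)      // still sparse: 1 distinct key < maxExactCount
+//   // once the exact-count table reaches maxExactCount distinct keys (652 for
+//   // depth 24, width 2719), `+` folds it into a dense CMSInstance via toDense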
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
+ */
+case class CMSInstance[K](
+ countsTable: CMSInstance.CountsTable[K],
+ override val totalCount: Long,
+ override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+ override def ++(other: CMS[K]): CMS[K] =
+ other match {
+ case _: CMSZero[?] => this
+ case other: CMSItem[K] => this + other.item
+ case other: SparseCMS[K] =>
+ other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+ cms + (x, count)
+ }
+ case other: CMSInstance[K] =>
+ val newTable = countsTable ++ other.countsTable
+ val newTotalCount = totalCount + other.totalCount
+ CMSInstance[K](newTable, newTotalCount, params)
+ }
+
+ private def makeApprox(est: Long): Approximate[Long] =
+ if (est == 0L) Approximate.exact(0L)
+ else {
+ val lower = math.max(0L, est - (eps * totalCount).toLong)
+ Approximate(lower, est, est, 1 - delta)
+ }
+
+ override def frequency(item: K): Approximate[Long] = {
+ var freq = Long.MaxValue
+ val hs = params.hashes
+ val it = countsTable.counts.iterator
+ var i = 0
+ while (it.hasNext) {
+ val row = it.next()
+ val count = row(hs(i)(item))
+ if (count < freq) freq = count
+ i += 1
+ }
+ makeApprox(freq)
+ }
+
+ /**
+ * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+ * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+ * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+ */
+ override def innerProduct(other: CMS[K]): Approximate[Long] =
+ other match {
+ case other: CMSInstance[?] =>
+ require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+ def innerProductAtDepth(d: Int) =
+ (0 to (width - 1)).iterator.map { w =>
+ countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+ }.sum
+
+ val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+ val minimum =
+ math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+ Approximate(minimum, est, est, 1 - delta)
+ case _ => other.innerProduct(this)
+ }
+
+ override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCountsTable =
+ (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+ val pos = (row, params.hashes(row)(item))
+ table + (pos, count)
+ }
+ CMSInstance[K](newCountsTable, totalCount + count, params)
+ } else this
+ }
+
+}
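+
+// A minimal sketch of innerProduct as a join-size estimate (both sketches must
+// share params; a CMSMonoid[Long] named `monoid` is assumed in scope):
+//
+//   val a = monoid.create(Seq(1L, 2L, 2L)) // a_1 = 1, a_2 = 2
+//   val b = monoid.create(Seq(2L, 3L))     // b_2 = 1, b_3 = 1
+//   // a.innerProduct(b).estimate >= a_2 * b_2 == 2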
+
+object CMSInstance {
+
+ /**
+ * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+ */
+ def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+ val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+ CMSInstance[K](countsTable, 0, params)
+ }
+
+ /**
+ * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+ * hash function.
+ */
+ // TODO: implement a dense matrix type, and use it here
+ case class CountsTable[K](counts: Vector[Vector[Long]]) {
+ require(depth > 0, "Table must have at least 1 row.")
+ require(width > 0, "Table must have at least 1 column.")
+
+ def depth: Int = counts.size
+
+ def width: Int = counts(0).size
+
+ def getCount(pos: (Int, Int)): Long = {
+ val (row, col) = pos
+ require(row < depth && col < width, "Position must be within the bounds of this table.")
+ counts(row)(col)
+ }
+
+ /**
+ * Updates the count of a single cell in the table.
+ */
+ def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+ val (row, col) = pos
+ val currCount = getCount(pos)
+ val newCounts =
+ counts.updated(row, counts(row).updated(col, currCount + count))
+ CountsTable[K](newCounts)
+ }
+
+ /**
+ * Adds another counts table to this one, through element-wise addition.
+ */
+ def ++(other: CountsTable[K]): CountsTable[K] = {
+ require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+ val xss = this.counts.iterator
+ val yss = other.counts.iterator
+ val rows = Vector.newBuilder[Vector[Long]]
+ while (xss.hasNext) {
+ val xs = xss.next().iterator
+ val ys = yss.next().iterator
+ val row = Vector.newBuilder[Long]
+ while (xs.hasNext) row += (xs.next() + ys.next())
+ rows += row.result()
+ }
+ CountsTable[K](rows.result())
+ }
+ }
+
+ object CountsTable {
+
+ /**
+ * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+ */
+ def apply[K](depth: Int, width: Int): CountsTable[K] =
+ CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+ }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ * // Creates a monoid for a CMS that can count `Long` elements.
+ * val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *   val eps = 0.001
+ *   val delta = 1E-10
+ *   val seed = 1
+ *   val heavyHittersPct = 0.1
+ *   TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ * }
+ *
+ * // Creates a TopCMS instance that has counted the element `1L`.
+ * val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ * // Estimates the frequency of `1L`
+ * val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ * // What are the heavy hitters so far?
+ * val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ * The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+ extends java.io.Serializable
+ with CMSCounting[K, TopCMS]
+ with CMSHeavyHitters[K] {
+
+ override val eps: Double = cms.eps
+
+ override val delta: Double = cms.delta
+
+ override val totalCount: Long = cms.totalCount
+
+ override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+ override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+ override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+ cms.innerProduct(other.cms)
+
+ override def f2: Approximate[Long] = innerProduct(this)
+
+ /**
+ * The pluggable logic with which heavy hitters are being tracked.
+ */
+ override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set.empty[K]
+
+ override def +(item: K, count: Long): TopCMS[K] =
+ TopCMSInstance(cms, params) + (item, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override val heavyHitters: Set[K] = Set(item)
+
+ override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => toCMSInstance + other.item
+ case other: TopCMSInstance[K] => other + item
+ }
+
+ private def toCMSInstance: TopCMSInstance[K] = {
+ val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+}
+
+object TopCMSInstance {
+
+ def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+ TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+ extends TopCMS[K](cms, params) {
+
+ override def heavyHitters: Set[K] = hhs.items
+
+ override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+ if (count != 0L) {
+ val newCms = cms + (item, count)
+ val newHhs =
+ heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+ TopCMSInstance[K](newCms, newHhs, params)
+ } else this
+ }
+
+ override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+ case _: TopCMSZero[?] => this
+ case other: TopCMSItem[K] => this + other.item
+ case other: TopCMSInstance[K] =>
+ val newCms = cms ++ other.cms
+ val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+ TopCMSInstance(newCms, newHhs, params)
+ }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+ val params: TopCMSParams[K] = TopCMSParams(logic)
+
+ override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+ /**
+ * Combines the two sketches.
+ *
+ * The sketches must use the same hash functions.
+ */
+ override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+ require(
+ left.cms.params.hashes == right.cms.params.hashes,
+ "The sketches must use the same hash functions."
+ )
+ left ++ right
+ }
+
+ /**
+ * Creates a sketch out of a single item.
+ */
+ def create(item: K): TopCMS[K] =
+ TopCMSItem[K](item, emptyCms + item, params)
+
+ /**
+ * Creates a sketch out of multiple items.
+ */
+ def create(data: Seq[K]): TopCMS[K] =
+ data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) }
+
+ override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = {
+ val topCandidates = scala.collection.mutable.Set.empty[K]
+ val summation = new CMSSummation(emptyCms.params)
+ sketches.iterator.foreach { sketch =>
+ summation.updateInto(sketch.cms)
+ topCandidates ++= sketch.heavyHitters
+ }
+ val cms = summation.result
+ val ests =
+ topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet
+ val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests))
+ TopCMSInstance(cms, hhs, params)
+ }
+
+ override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] =
+ if (sketches.iterator.isEmpty) None else Some(sum(sketches))
+}
+
+class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] {
+
+ override def monoid: TopCMSMonoid[K] = cmsMonoid
+
+ override def prepare(value: K): TopCMS[K] = monoid.create(value)
+
+ override def present(cms: TopCMS[K]): TopCMS[K] = cms
+
+}
+
+/**
+ * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters.
+ */
+abstract class HeavyHittersLogic[K] extends java.io.Serializable {
+
+ def updateHeavyHitters(
+ oldCms: CMS[K],
+ newCms: CMS[K]
+ )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = {
+ val oldItemCount = oldCms.frequency(item).estimate
+ val oldHh = HeavyHitter[K](item, oldItemCount)
+ val newItemCount = oldItemCount + count
+ val newHh = HeavyHitter[K](item, newItemCount)
+ purgeHeavyHitters(newCms)(hhs - oldHh + newHh)
+ }
+
+ def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = {
+ val candidates = (left.items ++ right.items).map { case i =>
+ HeavyHitter[K](i, cms.frequency(i).estimate)
+ }
+ val newHhs = HeavyHitters.from(candidates)
+ purgeHeavyHitters(cms)(newHhs)
+ }
+
+ def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K]
+
+}
+
+/**
+ * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)`
+ * times.
+ *
+ * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p
+ * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output.
+ *
+ * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked:
+ * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if
+ * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be
+ * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for
+ * tracking heavy hitters.
+ */
+case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] {
+
+ require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val minCount = heavyHittersPct * cms.totalCount
+ HeavyHitters[K](hitters.hhs.filter(_.count >= minCount))
+ }
+
+}
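+
+// A worked instance of the bound above: with heavyHittersPct = 0.25 and
+// totalCount = 100, purgeHeavyHitters keeps only items whose estimated count
+// is at least 0.25 * 100 = 25, so at most 1 / 0.25 = 4 items can remain.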
+
+/**
+ * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`.
+ *
+ * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias
+ * (with regard to heavy hitters) when merging instances. This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * @see
+ * Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+ val sorted =
+ hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+ HeavyHitters[K](sorted.toSet)
+ }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+ def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+ def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+ def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+ HeavyHitters[K](hhs ++ other.hhs)
+
+ def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+ def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+ private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+ def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+ hhs.foldLeft(empty[K])(_ + _)
+
+ def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ * A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ * times in the stream.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+ extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+ def monoid[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSMonoid[K] =
+ new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+ def aggregator[K: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersPct: Double
+ ): TopPctCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ * - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ * - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ * - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ * bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely
+ * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert
+ * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one
+ * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ * A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ * The maximum number of heavy hitters to track.
+ * @tparam K
+ * The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ * your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ * [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ * Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ * typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+ def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+ def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopNCMSAggregator[K] =
+ new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
+ */
+case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] {
+
+ require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+ override def purgeHeavyHitters(
+ cms: CMS[(K1, K2)]
+ )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = {
+ val grouped = hitters.hhs.groupBy(hh => hh.item._1)
+ val (underLimit, overLimit) = grouped.partition {
+ _._2.size <= heavyHittersN
+ }
+ val sorted = overLimit.transform { case (_, hhs) =>
+ hhs.toSeq.sortBy(hh => hh.count)
+ }
+ val purged = sorted.transform { case (_, hhs) =>
+ hhs.takeRight(heavyHittersN)
+ }
+ HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet)
+ }
+
+}
+
+/*
+ * Monoid for Top-N values per key in an associative [[TopCMS]].
+ *
+ * Typical use case for this might be (Country, City) pairs. For a stream of such
+ * pairs, we might want to keep track of the most popular cities for each country.
+ *
+ * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this
+ * requires storing one CMS per distinct Country.
+ *
+ * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common
+ * countries may not make the cut if N is not "very large".
+ *
+ * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others
+ * out, while still only using a single CMS.
+ *
+ * In general the eviction of K1 is not supported, and all distinct K1 values must
+ * be retained. Therefore it is important to only use this Monoid when the number
+ * of distinct K1 values is known to be reasonably bounded.
+ */
+class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100)
+ extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN))
+
+object ScopedTopNCMS {
+
+ def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] {
+ private val k1Hasher = implicitly[CMSHasher[K1]]
+ private val k2Hasher = implicitly[CMSHasher[K2]]
+
+ override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = {
+ val (k1, k2) = x
+ val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b)
+ (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width
+ }
+ }
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN)
+
+ def monoid[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): ScopedTopNCMSMonoid[K1, K2] =
+ monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ eps: Double,
+ delta: Double,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN))
+
+ def aggregator[K1: CMSHasher, K2: CMSHasher](
+ depth: Int,
+ width: Int,
+ seed: Int,
+ heavyHittersN: Int
+ ): TopCMSAggregator[(K1, K2)] =
+ aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
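+
+// A minimal usage sketch for the (Country, City) use case described above
+// (parameter values and strings are illustrative):
+//
+//   val monoid = ScopedTopNCMS.monoid[String, String](0.001, 1e-10, seed = 1, heavyHittersN = 2)
+//   val cms = monoid.create(Seq(("US", "NYC"), ("US", "SF"), ("FR", "Paris")))
+//   // cms.heavyHitters retains at most 2 cities per country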
+
+case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable {
+
+ /**
+ * Returns `a * x + b (mod p) (mod width)`.
+ */
+ def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x)
+
+}
+
+/**
+ * This formerly held the instances that moved to object CMSHasher
+ *
+ * These instances are slow, but here for compatibility with old serialized data. For new code, avoid these
+ * and instead use the implicits found in the CMSHasher companion object.
+ */
+object CMSHasherImplicits {
+
+ implicit object CMSHasherBigInt extends CMSHasher[BigInt] {
+ override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int =
+ CMSHasher.hashBytes(a, b, width)(x.toByteArray)
+ }
+
+ implicit object CMSHasherString extends CMSHasher[String] {
+ override def hash(a: Int, b: Int, width: Int)(x: String): Int =
+ CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8"))
+ }
+
+ def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort
+}
diff --git a/algebird-core/src/main/scala-3/DecayedVector.scala b/algebird-core/src/main/scala-3/DecayedVector.scala
new file mode 100644
index 000000000..18e816fe4
--- /dev/null
+++ b/algebird-core/src/main/scala-3/DecayedVector.scala
@@ -0,0 +1,75 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+/**
+ * Represents a container class together with time. Its monoid consists of exponentially scaling the older
+ * value and summing with the newer one.
+ */
+object DecayedVector extends CompatDecayedVector {
+ def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] =
+ DecayedVector(vector, time * scala.math.log(2.0) / halfLife)
+
+ def monoidWithEpsilon[C[_]](
+ eps: Double
+ )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] =
+ new Monoid[DecayedVector[C]] {
+ override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity)
+ override def plus(left: DecayedVector[C], right: DecayedVector[C]) =
+ if (left.scaledTime <= right.scaledTime) {
+ scaledPlus(right, left, eps)
+ } else {
+ scaledPlus(left, right, eps)
+ }
+ }
+
+ def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] =
+ DecayedVector[Map[K, _]](m, scaledTime)
+ def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] =
+ forMap(m, time * scala.math.log(2.0) / halfLife)
+
+ def mapMonoidWithEpsilon[K](
+ eps: Double
+ )(implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ monoidWithEpsilon[Map[K, _]](eps)
+
+ implicit def mapMonoid[K](implicit
+ vs: VectorSpace[Double, Map[K, _]],
+ metric: Metric[Map[K, Double]]
+ ): Monoid[DecayedVector[Map[K, _]]] =
+ mapMonoidWithEpsilon(-1.0)
+
+ def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit
+ vs: VectorSpace[Double, C],
+ metric: Metric[C[Double]]
+ ): DecayedVector[C] = {
+ implicit val mon: Monoid[C[Double]] = vs.group
+ val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime)
+ val newVector =
+ Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector))
+ if (eps < 0.0 || Metric.norm(newVector) > eps) {
+ DecayedVector(newVector, newVal.scaledTime)
+ } else {
+ DecayedVector(mon.zero, Double.NegativeInfinity)
+ }
+ }
+}
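+
+// A worked instance of the scaling above: scaledPlus weights the older vector
+// by exp(oldScaledTime - newScaledTime) = 2^(-(tNew - tOld) / halfLife), so a
+// value observed exactly one half-life earlier contributes at half weight
+// (implicit VectorSpace and Metric instances for Map are assumed in scope):
+//
+//   val v1 = DecayedVector.forMapWithHalflife(Map("a" -> 2.0), time = 0.0, halfLife = 10.0)
+//   val v2 = DecayedVector.forMapWithHalflife(Map("a" -> 1.0), time = 10.0, halfLife = 10.0)
+//   // scaledPlus(v2, v1, eps).vector == Map("a" -> 2.0), since 1.0 + 2.0 * 0.5 == 2.0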
+
+case class DecayedVector[C[_]](vector: C[Double], scaledTime: Double)
diff --git a/algebird-core/src/main/scala-3/DecayingCMS.scala b/algebird-core/src/main/scala-3/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-3/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decyaing CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend depth of at least 5, and width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+ seed: Long,
+ val halfLife: Duration,
+ val depth: Int, // number of hashing functions
+ val width: Int, // number of table cells per hashing function
+ hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+ override def toString: String =
+ s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+ @inline private def getNextLogScale(
+ logScale: Double,
+ oldTimeInHL: Double,
+ nowInHL: Double
+ ): Double =
+ if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+ @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+ val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+ Math.exp(-logScale1)
+ }
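+
+  // A worked instance of the scaling above: with logScale = 0.0 and
+  // nowInHL - oldTimeInHL = 1.0 (one half-life), getNextLogScale returns log2
+  // and getScale returns exp(-log2) == 0.5, i.e. counts halve per half-life.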
+
+ val empty: CMS =
+ new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+ /**
+ * Represents a decaying scalar value at a particular point in time.
+ *
+ * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+ * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+ * equivalent if they are two points on the same curve.
+ *
+ * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+ * values do not produce the same (approximate) Double values from these methods, they represent different
+ * curves.
+ */
+ class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+ lhs =>
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map(f: Double => Double): DoubleAt =
+ new DoubleAt(f(value), timeInHL)
+
+ // this is not public because it's not safe in general -- you need
+ // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt =
+ if (lhs.timeInHL < rhs.timeInHL) {
+ val x = lhs.scaledAt(rhs.timeInHL)
+ new DoubleAt(f(x, rhs.value), rhs.timeInHL)
+ } else if (lhs.timeInHL == rhs.timeInHL) {
+ new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL)
+ } else {
+ val y = rhs.scaledAt(lhs.timeInHL)
+ new DoubleAt(f(lhs.value, y), lhs.timeInHL)
+ }
+
+ def unary_- : DoubleAt = new DoubleAt(-value, timeInHL)
+ def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL)
+ def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL)
+
+ def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _)
+ def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _)
+ def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min)
+ def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max)
+
+ def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value
+
+ /**
+ * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent
+ * the same value at different points of decay.
+ */
+ def compare(rhs: DoubleAt): Int = {
+ val vc = cmp(lhs.value, rhs.value)
+ val tc = cmp(lhs.timeInHL, rhs.timeInHL)
+ if (vc == tc) vc
+ else if (tc == 0) vc
+ else if (vc == 0) tc
+ else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value)
+ else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL))
+ }
+
+ /**
+ * Time when this value will reach the smallest double value bigger than zero, unless we are already at
+ * zero in which case we return the current time
+ */
+ def timeToZero: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) timeInHL
+ else timeToUnit + DoubleAt.TimeFromUnitToZero
+
+ /**
+ * This is the scaled time when the current value will reach 1 (or -1 for negative values)
+ *
+ * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where
+ * its value would be 1, the unit value).
+ */
+ def timeToUnit: Double =
+ if (java.lang.Double.isNaN(value)) Double.NaN
+ else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+ else if (value == 0.0) Double.NegativeInfinity
+ else {
+ // solve for result:
+ //
+ // 1 = value * module.getScale(0.0, timeInHL, result)
+ // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+ // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+ // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = getNextLogScale(0.0, timeInHL, result)
+ // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+ // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+ //
+ // log(value) = (result - timeInHL) * log2
+ // log(value) / log2 = result - timeInHL
+ // log(value) / log2 + timeInHL = result
+ Math.log(Math.abs(value)) / log2 + timeInHL
+ }
+
+ override def equals(that: Any): Boolean =
+ that match {
+ case d: DoubleAt => compare(d) == 0
+ case _ => false
+ }
+
+ override def hashCode: Int =
+ timeToUnit.##
+
+ override def toString: String =
+ s"DoubleAt($value, $timeInHL)"
+
+ def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+ def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+ def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+ def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+ def time: Long =
+ toTimestamp(timeInHL)
+
+ private def scaledAt(t: Double): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, t)
+
+ def at(time: Long): Double =
+ if (value == 0.0) 0.0
+ else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+ }
+
+ object DoubleAt {
+ def apply(x: Double, t: Long): DoubleAt =
+ new DoubleAt(x, fromTimestamp(t))
+
+ val zero: DoubleAt =
+ new DoubleAt(0.0, Double.NegativeInfinity)
+
+ private val TimeFromUnitToZero: Double =
+ -Math.log(Double.MinPositiveValue) / log2
+ }
+
+ val totalCells: Int = depth * width
+
+ val halfLifeSecs: Double =
+ halfLife.toMillis.toDouble / 1000.0
+
+ // TODO: consider a smaller number?
+ // we are trading accuracy for possible performance
+ private[this] val maxLogScale: Double = 20.0
+
+ /**
+ * Allocate an empty array of rows.
+ *
+ * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+ * often building up cells mutably.
+ */
+ private def allocCells(): Array[Vector[Double]] =
+ new Array[Vector[Double]](depth)
+
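+ /**
+ * Convert a time measured in half-lives to epoch milliseconds (the inverse of fromTimestamp, up to
+ * Long truncation).
+ */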
+ def toTimestamp(t: Double): Long =
+ (t * halfLifeSecs * 1000.0).toLong
+
+ def fromTimestamp(t: Long): Double =
+ (t.toDouble / 1000.0) / halfLifeSecs
+
+ val hashFns: Array[K => Int] = {
+ val rng = new Random(seed)
+ def genPos(): Int =
+ rng.nextInt() match {
+ case 0 => genPos()
+ case n => n & 0x7fffffff
+ }
+
+ (0 until depth).map { _ =>
+ val n = genPos()
+ (k: K) => hasher.hash(n, 0, width)(k)
+ }.toArray
+ }
+
+ private final val log2 = Math.log(2.0)
+
+ /**
+ * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+ *
+ * delta = (t1 - t0) / halflife
+ * y1 = y0 * 2^(-delta) + n
+ *
+ * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+ * zero value should continue to have a zero value when n=0.
+ *
+ * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+ * the following formula:
+ *
+ * (1) zN = yN * scaleN
+ *
+ * Our constraint is expressed as:
+ *
+ * (2) If n=0, z1 = z0
+ *
+ * In that case:
+ *
+ * (3) If n=0, (y1 * scale1) = (y0 * scale0)
+ * (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+ * (5) 2^(-delta) * scale1 = scale0
+ * (6) scale1 = scale0 * 2^(delta)
+ *
+ * Also, to express z1 in terms of z0, we say:
+ *
+ * (7) z1 = y1 * scale1
+ * (8) z1 = (y0 * 2^(-delta) + n) * scale1
+ * (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+ * (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+ * (11) z1 / scale1 = z0 / scale1 + n
+ * (12) z1 = z0 + n * scale1
+ *
+ * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+ * in terms of z0 and scale1.
+ *
+ * If we convert scale to logscale, we have:
+ *
+ * (13) logscale1 = logscale0 + delta * log(2)
+ * (14) z1 = z0 + n * exp(logscale1)
+ *
+ * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+ * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+ * corresponding y) and set the logscale to 0.
+ *
+ * (15) y1 = z1 / scale1
+ * (16) y1 = z1 / exp(logscale1)
+ * (17) y1 = z1 * exp(-logscale1)
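+ *
+ * As a quick numeric check of (13)-(17): take z0 = 3 with logscale0 = 0, let one half-life elapse
+ * (delta = 1), and add n = 5. Then logscale1 = log(2), so (14) stores z1 = 3 + 5 * exp(log(2)) = 13,
+ * and (17) recovers y1 = 13 * exp(-log(2)) = 6.5 = 3 * 2^(-1) + 5, matching the idealized formula.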
+ */
+ final class CMS(
+ val cells: Array[Vector[Double]],
+ val logScale: Double,
+ val timeInHL: Double
+ ) extends Serializable {
+
+ @inline private def scale: Double =
+ Math.exp(-logScale)
+
+ override def toString: String = {
+ val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+ s"CMS($s, $logScale, $timeInHL)"
+ }
+
+ override def hashCode: Int =
+ deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+ logScale.## * 17 +
+ timeInHL.## * 37 +
+ 19
+
+ // unfortunately we can't check the path-dependent type of this
+ // CMS, which we signal by using a type projection here.
+ override def equals(any: Any): Boolean =
+ any match {
+ case that: DecayingCMS[?]#CMS =>
+ this.logScale == that.logScale &&
+ this.timeInHL == that.timeInHL &&
+ this.cells.length == that.cells.length && {
+ var i = 0
+ while (i < depth) {
+ if (this.cells(i) != that.cells(i)) return false
+ i += 1
+ }
+ true
+ }
+ case _ =>
+ false
+ }
+
+ def lastUpdateTime: Long =
+ toTimestamp(timeInHL)
+
+ /**
+ * Provide lower and upper bounds on values returned for any possible key.
+ *
+ * The first value is a lower bound: even keys that have never been counted will return this value or
+ * greater. This will be zero unless the CMS is saturated.
+ *
+ * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+ * larger than this value (though it might be reported as being smaller).
+ *
+ * Together these values indicate how saturated and skewed the CMS might be.
+ */
+ def range: (DoubleAt, DoubleAt) = {
+ var minMinimum = Double.PositiveInfinity
+ var minMaximum = Double.PositiveInfinity
+ var i = 0
+ while (i < cells.length) {
+ val it = cells(i).iterator
+ var localMax = it.next() // we know it doesn't start empty
+ if (localMax < minMinimum) minMinimum = localMax
+ while (it.hasNext) {
+ val n = it.next()
+ if (n > localMax) localMax = n
+ else if (n < minMinimum) minMinimum = n
+ }
+ if (localMax < minMaximum) minMaximum = localMax
+ i += 1
+ }
+
+ val s = scale
+ def sc(x: Double): DoubleAt =
+ new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+ (sc(minMinimum), sc(minMaximum))
+ }
+
+ /**
+ * Returns the square-root of the inner product of two decaying CMSs.
+ *
+ * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+ * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+ * at() on a CMS):
+ *
+ * x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+ *
+ * This is why we don't support innerProduct, only innerProductRoot.
+ */
+ def innerProductRoot(that: CMS): DoubleAt = {
+ var i = 0
+ var res = Double.PositiveInfinity
+ val t = Math.max(this.timeInHL, that.timeInHL)
+ val scale = this.getScale(t) * that.getScale(t)
+ while (i < depth) {
+ var sum = 0.0
+ val it0 = this.cells(i).iterator
+ val it1 = that.cells(i).iterator
+ while (it0.hasNext) {
+ val x = it0.next() * it1.next()
+ if (x != 0.0) sum += x
+ }
+ if (sum < res) res = sum
+ i += 1
+ }
+ val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+ new DoubleAt(x, t)
+ }
+
+ def l2Norm: DoubleAt =
+ innerProductRoot(this)
+
+ def scale(x: Double): CMS =
+ if (java.lang.Double.isNaN(x)) {
+ throw new IllegalArgumentException(s"invalid scale: $x")
+ } else if (x < 0.0) {
+ throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+ } else if (x == 0.0) {
+ module.empty
+ } else {
+ val s = logScale + Math.log(x)
+ val c = new CMS(cells, s, timeInHL)
+ if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+ }
+
+ /**
+ * Get the total count of all items in the CMS.
+ *
+ * The total is the same as the l1Norm, since we don't allow negative values.
+ *
+ * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+ * exact (except for floating-point error).
+ */
+ def total: DoubleAt = {
+ val n = cells(0).sum
+ val x = if (n == 0.0) 0.0 else scale * n
+ new DoubleAt(x, timeInHL)
+ }
+
+ def get(k: K): DoubleAt = {
+ var minValue = Double.PositiveInfinity
+ var didx = 0
+ while (didx < depth) {
+ val i = hashFns(didx)(k)
+ val inner = cells(didx)
+ val value = inner(i)
+ if (value < minValue) minValue = value
+ didx += 1
+ }
+ val x = if (minValue == 0.0) 0.0 else scale * minValue
+ new DoubleAt(x, timeInHL)
+ }
+
+ def getScale(t: Double): Double =
+ module.getScale(logScale, timeInHL, t)
+
+ private final def nextLogScale(t: Double): Double =
+ module.getNextLogScale(logScale, timeInHL, t)
+
+ def +(other: CMS): CMS = {
+ val x = this
+ val y = other
+ val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+ val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+ val xscale = x.getScale(timeInHL)
+ val yscale = y.getScale(timeInHL)
+
+ // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double =
+ if (x == 0.0) 0.0 else x * y
+
+ var i = 0
+ while (i < depth) {
+ val left = x.cells(i)
+ val right = y.cells(i)
+ var j = 0
+ val bldr = rowBuilder()
+ while (j < width) {
+ bldr += prod(left(j), xscale) + prod(right(j), yscale)
+ j += 1
+ }
+ cms.cells(i) = bldr.result()
+ i += 1
+ }
+ cms
+ }
+
+ def add(t: Long, k: K, n: Double): CMS =
+ scaledAdd(fromTimestamp(t), k, n)
+
+ // TODO: we could allocate a mutable scratch pad, write all the
+ // values into it, and then build a CMS out of it. if items is
+ // very small, this would be less efficient than what we're doing
+ // now. probably the "ideal" solution would be to determine how many
+ // items there are. if we have fewer than ~width items, this
+ // approach is fine. for more, a scratch pad would be better
+ // (assuming we wrote that code).
+ //
+ // alternately, you could map items into (zero + item) and then
+ // use the monoid's sum to boil it down.
+ //
+ // we only use this in testing currently so the current code is
+ // fine until we rely on it in production. any change here should
+ // probably include benchmarks justifying the design.
+ def bulkAdd(items: Iterable[(Long, K, Double)]): CMS =
+ items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) }
+
+ private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS =
+ if (n < 0.0) {
+ val t = toTimestamp(ts1)
+ throw new IllegalArgumentException(
+ s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t"
+ )
+ } else if (n == 0.0) {
+ this
+ } else {
+ val logScale1 = nextLogScale(ts1)
+ if (logScale1 > maxLogScale) {
+ rescaleTo(ts1).scaledAdd(ts1, k, n)
+ } else {
+ val increment = n * Math.exp(logScale1)
+ val cells1 = allocCells()
+ var didx = 0
+ while (didx < depth) {
+ val cell = cells(didx)
+ val w = hashFns(didx)(k)
+ cells1(didx) = cell.updated(w, cell(w) + increment)
+ didx += 1
+ }
+ new CMS(cells1, logScale1, ts1)
+ }
+ }
+
+ // Set the scale back to 0.0
+ // input time is in half-lives
+ private[algebird] def rescaleTo(ts: Double): CMS = {
+ val logScale1 = nextLogScale(ts)
+ val expL = Math.exp(-logScale1)
+ if (expL == 0.0) {
+ new CMS(monoid.zero.cells, 0.0, ts)
+ } else {
+ val cms = new CMS(allocCells(), 0.0, ts)
+ var i = 0
+ while (i < depth) {
+ val ci = cells(i)
+ cms.cells(i) = ci.map(_ * expL)
+ i += 1
+ }
+ cms
+ }
+ }
+ }
+
+ private def rowBuilder() = {
+ val bldr = Vector.newBuilder[Double]
+ bldr.sizeHint(width)
+ bldr
+ }
+
+ object CMS {
+
+ implicit val monoidForCMS: Monoid[CMS] =
+ new Monoid[CMS] {
+
+ def zero: CMS = module.empty
+
+ def plus(x: CMS, y: CMS): CMS =
+ x + y
+
+ /**
+ * Turn a flat array into an array of vectors.
+ */
+ private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = {
+ val cells = new Array[Vector[Double]](depth)
+ var i = 0
+ while (i < depth) {
+ var j = i * width
+ val limit = j + width
+ val bldr = rowBuilder()
+ while (j < limit) {
+ bldr += scratch(j)
+ j += 1
+ }
+ cells(i) = bldr.result()
+ i += 1
+ }
+ cells
+ }
+
+ /**
+ * This method sums the first `num` items in `arr`.
+ */
+ private def innerSum(arr: Array[CMS], num: Int): CMS =
+ if (num == 0) zero
+ else if (num == 1) arr(0)
+ else if (num == 2) plus(arr(0), arr(1))
+ else {
+ // start with zero
+ val scratch: Array[Double] = new Array(totalCells)
+
+ val latestTimeInHL: Double =
+ arr.iterator.take(num).map(cms => cms.timeInHL).max
+
+ var i = 0
+ while (i < num) {
+ val cms = arr(i)
+ val scale = cms.getScale(latestTimeInHL)
+ var j = 0
+ while (j < depth) {
+ val row = cms.cells(j)
+ val stride = j * width
+ var k = 0
+ while (k < width) {
+ val n = row(k)
+ if (n > 0.0) {
+ scratch(stride + k) += scale * n
+ }
+ k += 1
+ }
+ j += 1
+ }
+ i += 1
+ }
+
+ val cells = scratchToCells(scratch)
+
+ new CMS(cells, 0.0, latestTimeInHL)
+ }
+
+ override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = {
+
+ val it: Iterator[CMS] = xs.toIterator
+ val ChunkSize = 1000
+
+ // the idea here is that we read up to 1000 CMS values into
+ // a fixed array, crunch them down to a single CMS, store it
+ // in the first array index, read up to 999 more CMS values
+ // in, crunch them down, and so on.
+ var i = 0
+ val arr = new Array[CMS](ChunkSize)
+ while (it.hasNext) {
+ while (it.hasNext && i < ChunkSize) {
+ arr(i) = it.next()
+ i += 1
+ }
+ if (i > 1) {
+ arr(0) = innerSum(arr, i)
+ }
+ i = 1
+ }
+ if (i == 0) None else Some(arr(0))
+ }
+ }
+ }
+
+ val monoid: Monoid[CMS] = CMS.monoidForCMS
+}
+
+object DecayingCMS {
+
+ /**
+ * Construct a DecayingCMS module.
+ *
+ * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will
+ * always produce the same hash family.
+ *
+ * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by
+ * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to
+ * zero.
+ *
+ * The size of the CMS in bytes is O(depth * width).
+ *
+ * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use
+ * width=100, for 0.1% error, use width=1000, etc.
+ *
+ * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha *
+ * depth), so a small depth (e.g. 5-10) is fine. Each update requires O(depth) work, so you want to keep this
+ * as small as possible.
+ */
+ def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit
+ hasher: CMSHasher[K]
+ ): DecayingCMS[K] =
+ new DecayingCMS(seed, halfLife, depth, width, hasher)
+}
diff --git a/algebird-core/src/main/scala-3/Fold.scala b/algebird-core/src/main/scala-3/Fold.scala
new file mode 100644
index 000000000..0b89f2d62
--- /dev/null
+++ b/algebird-core/src/main/scala-3/Fold.scala
@@ -0,0 +1,352 @@
+/*
+Copyright 2014 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.io.Serializable
+import scala.collection.compat._
+
+/**
+ * Folds are first-class representations of "Traversable.foldLeft." They have the nice property that they can
+ * be fused to work in parallel over an input sequence.
+ *
+ * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when
+ * done. We use existential types to hide internal details and to allow for internal and external (X and O)
+ * types to differ for "map" and "join."
+ *
+ * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a
+ * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the
+ * fold.
+ *
+ * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like
+ * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also
+ * expose some internal state so library authors can fold over their own types.
+ *
+ * See the companion object for constructors.
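+ *
+ * As a small example (a sketch), two folds fused into a single pass over the input:
+ *
+ * {{{
+ * val mean: Fold[Int, Double] =
+ *   Fold.size.joinWith(Fold.sum[Int]) { (n, total) => total.toDouble / n }
+ * mean.overTraversable(List(1, 2, 3, 4)) // 2.5
+ * }}}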
+ */
+sealed trait Fold[-I, +O] extends Serializable {
+
+ /**
+ * Users can ignore this type.
+ *
+ * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good
+ * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it
+ * provides.
+ */
+ type X
+
+ /**
+ * Users can ignore this method. It is exposed so library authors can run folds over their own sequence
+ * types.
+ *
+ * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the
+ * same Fold many times over different data structures, but we must build a new FoldState every time.
+ *
+ * See FoldState for information on how to use this for your own sequence types.
+ */
+ def build(): FoldState[X, I, O]
+
+ /**
+ * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or
+ * "Function1.compose."
+ */
+ def map[P](f: O => P): Fold[I, P] = {
+ val self = this
+ new Fold[I, P] {
+ type X = self.X
+ override def build(): FoldState[X, I, P] =
+ self.build().map(f)
+ }
+ }
+
+ /**
+ * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time
+ * and combines at the end.
+ */
+ def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = {
+ val self = this
+ new Fold[I2, Q] {
+ type X = (self.X, other.X)
+ override def build(): FoldState[X, I2, Q] = {
+ val first = self.build()
+ val second = other.build()
+ new FoldState(
+ { case ((x, y), i) => (first.add(x, i), second.add(y, i)) },
+ (first.start, second.start),
+ { case (x, y) => f(first.end(x), second.end(y)) }
+ )
+ }
+ }
+ }
+
+ /**
+ * Convenient shorthand for joining Folds without combining at the end.
+ */
+ def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] =
+ joinWith(other) { case (o, p) => (o, p) }
+
+ /**
+ * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.")
+ * This is analogous to "Function1.andThen."
+ */
+ def contramap[H](f: H => I): Fold[H, O] = {
+ val self = this
+ new Fold[H, O] {
+ type X = self.X
+ override def build(): FoldState[X, H, O] =
+ self.build().contramap(f)
+ }
+ }
+
+ /**
+ * Trivially runs a Fold over an empty sequence.
+ */
+ def overEmpty: O = {
+ // build is a "def" so we construct the state once and use the pieces to run the fold
+ val state = build()
+ state.end(state.start)
+ }
+
+ /**
+ * Trivially runs a Fold over a single element sequence.
+ */
+ def overSingleton(i: I): O = {
+ val state = build()
+ state.end(state.add(state.start, i))
+ }
+
+ /**
+ * Runs a Fold over a Traversable.
+ */
+ def overTraversable(is: TraversableOnce[I]): O = {
+ val state = build()
+ state.end(is.iterator.foldLeft(state.start)(state.add))
+ }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+ extends Serializable {
+
+ /**
+ * Transforms the output type of the FoldState (see Fold.map).
+ */
+ def map[P](f: O => P): FoldState[X, I, P] =
+ new FoldState(add, start, end.andThen(f))
+
+ /**
+ * Transforms the input type of the FoldState (see Fold.contramap).
+ */
+ def contramap[H](f: H => I): FoldState[X, H, O] =
+ new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+ /**
+ * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+ */
+ implicit def applicative[I]: Applicative[Fold[I, _]] =
+ new FoldApplicative[I]
+
+ /**
+ * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+ */
+ def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+ fold[O, I, O](add, o, o => o)
+
+ /**
+ * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+ * immutable and serializable.
+ */
+ def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start, end)
+ }
+
+ /**
+ * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
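+ *
+ * A sketch, collecting into a mutable buffer that never escapes the fold:
+ *
+ * {{{
+ * import scala.collection.mutable.Buffer
+ * val toList: Fold[Int, List[Int]] =
+ *   Fold.foldMutable[Buffer[Int], Int, List[Int]]((b, i) => { b += i; b }, _ => Buffer.empty, _.toList)
+ * }}}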
+ */
+ def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+ new Fold[I, O] {
+ type X = M
+ override def build(): FoldState[X, I, O] =
+ new FoldState(add, start(()), end)
+ }
+
+ /**
+ * Fuse a sequence of Folds into one that outputs the result of each.
+ */
+ def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+ new Fold[I, Seq[O]] {
+ type X = Seq[Any]
+ override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+ val bs: Seq[FoldState[Any, I, O]] =
+ ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+ val adds =
+ bs.map(_.add)
+ val ends =
+ bs.map(_.end)
+ val starts: Seq[Any] =
+ bs.map(_.start)
+ val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+ val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+ new FoldState(add, starts, end)
+ }
+ }
+
+ /**
+ * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+ * better type inference.
+ */
+ def seq[I]: Fold[I, Seq[I]] =
+ container[I, Seq]
+
+ /**
+ * A Fold that does no work and returns a constant. Analogous to Function1 const:
+ * def const[A, B](b: B): (A => B) = { _ => b }
+ */
+ def const[O](value: O): Fold[Any, O] =
+ Fold.foldLeft(value) { case (u, _) => u }
+
+ /**
+ * A Fold that runs the given side effect for every element.
+ */
+ def foreach[I](e: I => Unit): Fold[I, Unit] =
+ Fold.foldLeft(()) { case (_, i) => e(i) }
+
+ /**
+ * A Fold that returns the first value in a sequence.
+ */
+ def first[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the last value in a sequence.
+ */
+ def last[I]: Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+ /**
+ * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+ */
+ def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the min value in a sequence. (Biased to earlier equal values.)
+ */
+ def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+ Fold.foldLeft[I, Option[I]](None) {
+ case (None, i) => Some(i)
+ case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+ case (x, _) => x
+ }
+
+ /**
+ * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+ */
+ def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+ /**
+ * For a semigroup, if we get more than 0 items, use plus
+ */
+ def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+ Fold.foldLeft(None: Option[T]) {
+ case (None, i) => Some(i)
+ case (Some(l), r) => Some(sg.plus(l, r))
+ }
+
+ /**
+ * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+ */
+ def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+ Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+ /**
+ * A Fold that returns the length of a sequence.
+ */
+ def size: Fold[Any, Long] =
+ Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+ /**
+ * A Fold that returns "true" if all elements of the sequence statisfy the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(true)((b, i) => b && pred(i))
+
+ /**
+ * A Fold that returns "true" if any element of the sequence statisfies the predicate. Note this does not
+ * short-circuit enumeration of the sequence.
+ */
+ def exists[I](pred: I => Boolean): Fold[I, Boolean] =
+ foldLeft(false)((b, i) => b || pred(i))
+
+ /**
+ * A Fold that counts the number of elements satisfying the predicate.
+ */
+ def count[I](pred: I => Boolean): Fold[I, Long] =
+ foldLeft(0L) {
+ case (c, i) if pred(i) => c + 1L
+ case (c, _) => c
+ }
+}
+
+/**
+ * Folds are Applicatives!
+ */
+class FoldApplicative[I] extends Applicative[Fold[I, _]] {
+ override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] =
+ mt.map(fn)
+ override def apply[T](v: T): Fold[I, T] =
+ Fold.const(v)
+ override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] =
+ mt.join(mu)
+ override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] =
+ Fold.sequence(ms)
+ override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] =
+ mt.joinWith(mu)(fn)
+}
diff --git a/algebird-core/src/main/scala-3/Interval.scala b/algebird-core/src/main/scala-3/Interval.scala
new file mode 100644
index 000000000..6a1645d16
--- /dev/null
+++ b/algebird-core/src/main/scala-3/Interval.scala
@@ -0,0 +1,380 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird
+
+// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...)
+
+/**
+ * Represents a single interval on a T with an Ordering
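+ *
+ * For example (a sketch, using the constructors on the companion object):
+ *
+ * {{{
+ * val i: Interval[Int] = Interval.leftClosedRightOpen(0, 10)
+ * i.contains(0)  // true
+ * i.contains(10) // false
+ * }}}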
+ */
+sealed trait Interval[T] extends java.io.Serializable {
+ def contains(t: T)(implicit ord: Ordering[T]): Boolean
+
+ def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T]
+ final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t)
+ final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that)
+
+ /**
+ * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the
+ * result is meaningless. TODO: It might be good to have types for these properties in algebird.
+ */
+ def mapNonDecreasing[U](fn: T => U): Interval[U]
+}
+
+case class Universe[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ that
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe()
+}
+
+case class Empty[T]() extends Interval[T] {
+ override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false
+ override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] =
+ this
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty()
+}
+
+object Interval extends java.io.Serializable {
+
+ /**
+ * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type
+ * information of the returned interval. The compiler doesn't know anything about ordering, so without
+ * [[MaybeEmpty]] the only valid return type is Interval[T].
+ */
+ sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] {
+ def isEmpty: Boolean
+ }
+ object MaybeEmpty {
+
+ /**
+ * Represents an empty interval.
+ */
+ case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = true
+ }
+
+ /**
+ * Represents a non-empty interval.
+ */
+ case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] {
+ override def isEmpty: Boolean = false
+ }
+ }
+
+ type GenIntersection[T] = Intersection[Lower, Upper, T]
+ type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T]
+ type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T]
+ type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T]
+ type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T]
+
+ implicit def monoid[T: Ordering]: Monoid[Interval[T]] =
+ Monoid.from[Interval[T]](Universe[T]())(_ && _)
+
+ // Automatically convert from a MaybeEmpty instance
+ implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] =
+ me match {
+ case MaybeEmpty.SoEmpty() => Empty()
+ case MaybeEmpty.NotSoEmpty(i) => i
+ }
+
+ def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowExUp]()
+
+ def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowInUp]()
+
+ def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] =
+ if (Ordering[T].lteq(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, InLowInUp]()
+
+ def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] =
+ if (Ordering[T].lt(lower, upper))
+ MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper)))
+ else MaybeEmpty.SoEmpty[T, ExLowExUp]()
+
+ /**
+ * This is here for binary compatibility reasons. These methods should be moved to Interval, which should
+ * also be an abstract class for better binary compatibility at the next incompatible change
+ */
+ implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+ def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+ case Empty() => true
+ case Universe() => false
+ case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.ordering.lt(l, u)
+ case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.ordering.lteq(l, u)
+ case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lt(_, u))
+ case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+ !succ.next(l).exists(succ.ordering.lteq(_, u))
+ case InclusiveLower(_) => false // we at least have l
+ case InclusiveUpper(_) => false // we at least have u
+ case ExclusiveLower(l) =>
+ succ.next(l).isEmpty
+ case ExclusiveUpper(u) =>
+ pred.prev(u).isEmpty
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s less than t such that intr.contains(s)
+ *
+ * if this returns None, it may be Empty, Upper or Universe
+ */
+ def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Upper[?] => None
+ case i @ Intersection(_, _) => i.least
+ case l: Lower[?] => l.least
+ }
+
+ /**
+ * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+ * intr.contains(s)
+ *
+ * if this returns None, it may be Empty, Lower, or Universe
+ */
+ def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+ intr match {
+ case Empty() => None
+ case Universe() => None
+ case _: Lower[?] => None
+ case i @ Intersection(_, _) => i.greatest
+ case u: Upper[?] => u.greatest
+ }
+ }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+ /**
+ * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+ * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+ * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+ * notion, which we don't have a typeclass for.
+ */
+ def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+ /**
+ * The smallest value that is contained here. This is an Option, because of cases like
+ * ExclusiveLower(Int.MaxValue) which are pathological and equivalent to Empty
+ */
+ def least(implicit s: Successible[T]): Option[T]
+ def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Lower[T] from lowest to highest
+ */
+ def toIterable(implicit s: Successible[T]): Iterable[T] =
+ least match {
+ case Some(l) => s.iterateNext(l)
+ case None => Iterable.empty
+ }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+ /**
+ * The largest value that is contained here. This is an Option, because of cases like
+ * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty
+ */
+ def greatest(implicit p: Predecessible[T]): Option[T]
+ // The smallest value that is not present
+ def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+ /**
+ * Iterates all the items in this Upper[T] from highest to lowest
+ */
+ def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+ greatest match {
+ case Some(g) => p.iteratePrev(g)
+ case None => Iterable.empty
+ }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gt(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+ case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+ }
+ override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(lower, t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case ub @ InclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case ub @ ExclusiveUpper(_) =>
+ if (intersects(ub)) Intersection(this, ub) else Empty()
+ case InclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case ExclusiveLower(thatlb) =>
+ if (ordering.gteq(lower, thatlb)) this else that
+ case Intersection(thatL, thatU) => (this && thatL) && thatU
+ }
+ override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+ u match {
+ case InclusiveUpper(upper) => ordering.lt(lower, upper)
+ case ExclusiveUpper(upper) =>
+ ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+ }
+ override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+ override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = Some(lower)
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower))
+}
+case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lteq(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lt(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper))
+}
+case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ ordering.lt(t, upper)
+ override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper)
+ // The smallest value that is not present
+ override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case lb @ ExclusiveLower(_) =>
+ if (lb.intersects(this)) Intersection(lb, this) else Empty()
+ case InclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case ExclusiveUpper(thatub) =>
+ if (ordering.lteq(upper, thatub)) this else that
+ case Intersection(thatL, thatU) => thatL && (this && thatU)
+ }
+ override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper))
+}
+
+case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] {
+ override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+ lower.contains(t) && upper.contains(t)
+ override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+ case Universe() => this
+ case Empty() => that
+ case lb @ InclusiveLower(_) => (lb && lower) && upper
+ case lb @ ExclusiveLower(_) => (lb && lower) && upper
+ case ub @ InclusiveUpper(_) => lower && (ub && upper)
+ case ub @ ExclusiveUpper(_) => lower && (ub && upper)
+ case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU)
+ }
+ override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = {
+ val newLower = lower match {
+ case InclusiveLower(l) => InclusiveLower(fn(l))
+ case ExclusiveLower(l) => ExclusiveLower(fn(l))
+ }
+ val newUpper = upper match {
+ case InclusiveUpper(u) => InclusiveUpper(fn(u))
+ case ExclusiveUpper(u) => ExclusiveUpper(fn(u))
+ }
+ Intersection(newLower, newUpper)
+ }
+
+ def least(implicit s: Successible[T]): Option[T] =
+ lower.least.filter(upper.contains(_)(s.ordering))
+
+ /**
+ * Goes from lowest to highest for all items that are contained in this Intersection
+ */
+ def leastToGreatest(implicit s: Successible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = s.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+ }
+ }
+
+ def greatest(implicit p: Predecessible[T]): Option[T] =
+ upper.greatest.filter(lower.contains(_)(p.ordering))
+
+ /**
+ * Goes from highest to lowest for all items that are contained in this Intersection
+ */
+ def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+ val self = this
+ implicit val ord: Ordering[T] = p.ordering
+ // TODO https://github.com/twitter/algebird/issues/263
+ new AbstractIterable[T] {
+ // We have to do this because the normal takeWhile causes OOM on big intervals
+ override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+ }
+ }
+
+ /**
+ * Some intervals can actually be synonyms for empty: (0,0) for instance, contains nothing. This cannot be
+ * normalized to [a, b) form, thus we return an Option. Also, there are cases like [Int.MinValue,
+ * Int.MaxValue] that are actually equivalent to Universe. The bottom line: if this returns None, it
+ * just means you can't express it this way; it does not mean it is empty or universe, etc... (there are
+ * other cases).
+ */
+ def toLeftClosedRightOpen(implicit
+ s: Successible[T]
+ ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+ for {
+ l <- lower.least
+ g <- upper.strictUpperBound if s.ordering.lt(l, g)
+ } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-3/InvariantAlgebras.scala b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
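+/**
+ * Derive an algebra for a type U from the algebra of a type T, given functions back and forth.
+ *
+ * A sketch of typical usage, wrapping Int's monoid for a simple wrapper type:
+ *
+ * {{{
+ * case class Meters(value: Int)
+ * implicit val metersMonoid: Monoid[Meters] =
+ *   new InvariantMonoid[Int, Meters](Meters(_), _.value)
+ * }}}
+ */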
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+ extends Semigroup[U] {
+ override def plus(l: U, r: U): U =
+ forward(semigroup.plus(reverse(l), reverse(r)))
+ override def sumOption(iter: TraversableOnce[U]): Option[U] =
+ semigroup.sumOption(iter.map(reverse)).map(forward)
+
+ /*
+ * Note these work for the subclasses since in those cases semigroup
+ * will be the appropriate algebra.
+ */
+ override val hashCode: Int = (forward, reverse, semigroup).hashCode
+ override def equals(that: Any): Boolean =
+ that match {
+ case r: InvariantSemigroup[?, ?] =>
+ (hashCode == r.hashCode) &&
+ (forward == r.forward) &&
+ (reverse == r.reverse) &&
+ (semigroup == r.semigroup)
+ case _ => false
+ }
+}
+
+class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T])
+ extends InvariantSemigroup[T, U](forward, reverse)
+ with Monoid[U] {
+ override val zero: U = forward(monoid.zero)
+}
+
+class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T])
+ extends InvariantMonoid[T, U](forward, reverse)
+ with Group[U] {
+ override def negate(u: U): U = forward(group.negate(reverse(u)))
+ override def minus(l: U, r: U): U =
+ forward(group.minus(reverse(l), reverse(r)))
+}
+
+class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T])
+ extends InvariantGroup[T, U](forward, reverse)
+ with Ring[U] {
+ override val one: U = forward(ring.one)
+ override def times(l: U, r: U): U =
+ forward(ring.times(reverse(l), reverse(r)))
+ override def product(iter: TraversableOnce[U]): U =
+ forward(ring.product(iter.map(reverse)))
+}
diff --git a/algebird-core/src/main/scala-3/JavaMonoids.scala b/algebird-core/src/main/scala-3/JavaMonoids.scala
new file mode 100644
index 000000000..26ce54f0a
--- /dev/null
+++ b/algebird-core/src/main/scala-3/JavaMonoids.scala
@@ -0,0 +1,147 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.lang.{
+ Boolean => JBool,
+ Double => JDouble,
+ Float => JFloat,
+ Integer => JInt,
+ Long => JLong,
+ Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+ override val zero: JInt = JInt.valueOf(0)
+ override val one: JInt = JInt.valueOf(1)
+ override def plus(x: JInt, y: JInt): JInt = x + y
+ override def negate(x: JInt): JInt = -x
+ override def minus(x: JInt, y: JInt): JInt = x - y
+ override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+ override val zero: JShort = Short.box(0)
+ override val one: JShort = Short.box(1)
+ override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+ override def negate(x: JShort): JShort = (-x).toShort
+ override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+ override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+ override val zero: JLong = JLong.valueOf(0L)
+ override val one: JLong = JLong.valueOf(1L)
+ override def plus(x: JLong, y: JLong): JLong = x + y
+ override def negate(x: JLong): JLong = -x
+ override def minus(x: JLong, y: JLong): JLong = x - y
+ override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+ override val zero: JFloat = JFloat.valueOf(0.0f)
+ override val one: JFloat = JFloat.valueOf(1.0f)
+ override def plus(x: JFloat, y: JFloat): JFloat = x + y
+ override def negate(x: JFloat): JFloat = -x
+ override def minus(x: JFloat, y: JFloat): JFloat = x - y
+ override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+ override val zero: JDouble = JDouble.valueOf(0.0)
+ override val one: JDouble = JDouble.valueOf(1.0)
+ override def plus(x: JDouble, y: JDouble): JDouble = x + y
+ override def negate(x: JDouble): JDouble = -x
+ override def minus(x: JDouble, y: JDouble): JDouble = x - y
+ override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+ override val zero: JBool = JBool.FALSE
+ override val one: JBool = JBool.TRUE
+ override def plus(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue ^ y.booleanValue)
+ override def negate(x: JBool): JBool = x
+ override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+ override def times(x: JBool, y: JBool): JBool =
+ JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since Lists are mutable, this always makes a full copy. Prefer scala immutable Lists: if you use scala
+ * immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+ override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+ override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+ override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+ val res = new JArrayList[T](x.size + y.size)
+ res.addAll(x)
+ res.addAll(y)
+ res
+ }
+}
+
+/**
+ * Since maps are mutable, this always makes a full copy. Prefer scala immutable maps: if you use scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+ override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+ val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: JMap[K, V]): Boolean =
+ !x.isEmpty && (implicitly[Semigroup[V]] match {
+ case mon: Monoid[?] =>
+ x.values.asScala.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+ override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ val vsemi = implicitly[Semigroup[V]]
+ val result = new JHashMap[K, V](big.size + small.size)
+ result.putAll(big)
+ small.entrySet.asScala.foreach { kv =>
+ val smallK = kv.getKey
+ val smallV = kv.getValue
+ if (big.containsKey(smallK)) {
+ val bigV = big.get(smallK)
+ val newV =
+ if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV)
+ if (nonZero(newV))
+ result.put(smallK, newV)
+ else
+ result.remove(smallK)
+ } else {
+ // No need to explicitly add with zero on V, just put in the small value
+ result.put(smallK, smallV)
+ }
+ }
+ result
+ }
+}
diff --git a/algebird-core/src/main/scala-3/MapAlgebra.scala b/algebird-core/src/main/scala-3/MapAlgebra.scala
new file mode 100644
index 000000000..9ca370eaf
--- /dev/null
+++ b/algebird-core/src/main/scala-3/MapAlgebra.scala
@@ -0,0 +1,320 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import com.twitter.algebird.macros.{Cuber, Roller}
+import scala.collection.mutable.{Builder, Map => MMap}
+import scala.collection.{Map => ScMap}
+import algebra.ring.Rng
+import scala.collection.compat._
+
+trait MapOperations[K, V, M <: ScMap[K, V]] {
+ def add(oldMap: M, kv: (K, V)): M
+ def remove(oldMap: M, k: K): M
+ def fromMutable(mut: MMap[K, V]): M
+}
+
+abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V])
+ extends Monoid[M]
+ with MapOperations[K, V, M] {
+
+ val nonZero: (V => Boolean) = semigroup match {
+ case mon: Monoid[?] => mon.isNonZero(_)
+ case _ => _ => true
+ }
+
+ override def isNonZero(x: M): Boolean =
+ !x.isEmpty && (semigroup match {
+ case mon: Monoid[?] =>
+ x.valuesIterator.exists(v => mon.isNonZero(v))
+ case _ => true
+ })
+
+ override def plus(x: M, y: M): M = {
+ // Scala maps can reuse internal structure, so don't copy: just add into the bigger one.
+ // This really saves computation when adding lots of small maps into big ones (common)
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small match {
+ // Mutable maps create new copies of the underlying data on add so don't use the
+ // handleImmutable method.
+ // Cannot have a None so 'get' is safe here.
+ case _: MMap[?, ?] => sumOption(Seq(big, small)).get
+ case _ => handleImmutable(big, small, bigOnLeft)
+ }
+ }
+
+ private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) =
+ small.foldLeft(big) { (oldMap, kv) =>
+ val newV = big
+ .get(kv._1)
+ .map { bigV =>
+ if (bigOnLeft)
+ semigroup.plus(bigV, kv._2)
+ else
+ semigroup.plus(kv._2, bigV)
+ }
+ .getOrElse(kv._2)
+ if (nonZero(newV))
+ add(oldMap, kv._1 -> newV)
+ else
+ remove(oldMap, kv._1)
+ }
+ override def sumOption(items: TraversableOnce[M]): Option[M] =
+ if (items.iterator.isEmpty) None
+ else {
+ val mutable = MMap[K, V]()
+ items.iterator.foreach { m =>
+ m.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val newV =
+ if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v)
+ if (nonZero(newV))
+ mutable.update(k, newV)
+ else
+ mutable.remove(k)
+ }
+ }
+ Some(fromMutable(mutable))
+ }
+}
+
+class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] {
+ override lazy val zero: Map[K, V] = Map[K, V]()
+ override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv
+ override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): Map[K, V] =
+ new MutableBackedMap(mut)
+}
+
+class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] {
+ override lazy val zero: ScMap[K, V] = ScMap[K, V]()
+ override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv
+ override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k
+ override def fromMutable(mut: MMap[K, V]): ScMap[K, V] =
+ new MutableBackedMap(mut)
+}
+
+/**
+ * You can think of this as a Sparse vector group
+ */
+class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] {
+ override def negate(kv: Map[K, V]): Map[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+class ScMapGroup[K, V](implicit val group: Group[V])
+ extends ScMapMonoid[K, V]()(group)
+ with Group[ScMap[K, V]] {
+ override def negate(kv: ScMap[K, V]): ScMap[K, V] =
+ kv.iterator.map { case (k, v) =>
+ (k, group.negate(v))
+ }.toMap
+}
+
+/**
+ * You can think of this as a Sparse vector ring
+ */
+trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] {
+
+ implicit def ring: Ring[V]
+
+ override def times(x: M, y: M): M = {
+ val (big, small, bigOnLeft) =
+ if (x.size > y.size) {
+ (x, y, true)
+ } else {
+ (y, x, false)
+ }
+ small.foldLeft(zero) { (oldMap, kv) =>
+ val bigV = big.getOrElse(kv._1, ring.zero)
+ val newV =
+ if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV)
+ if (ring.isNonZero(newV)) {
+ add(oldMap, kv._1 -> newV)
+ } else {
+ remove(oldMap, kv._1)
+ }
+ }
+ }
+}
+
+class MapRing[K, V](implicit override val ring: Ring[V])
+ extends MapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, Map[K, V]]
+
+class ScMapRing[K, V](implicit override val ring: Ring[V])
+ extends ScMapGroup[K, V]()(ring)
+ with GenericMapRing[K, V, ScMap[K, V]]
+
+object MapAlgebra {
+ def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean =
+ l.forall { case (k, v) =>
+ r.get(k).exists(Equiv[V].equiv(_, v))
+ }
+
+ implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] =
+ Equiv.fromFunction { (m1, m2) =>
+ val cleanM1 = removeZeros(m1)
+ val cleanM2 = removeZeros(m2)
+ rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, cleanM1)
+ }
+
+ def mergeLookup[T, U, V: Monoid](
+ keys: TraversableOnce[T]
+ )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+ sumByKey {
+ keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+ }
+
+ // Returns a new map with zero-value entries removed
+ def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+ m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+ /**
+ * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+ * the output any key whose values sum to Monoid.zero. If the Semigroup is a Monoid, this function is
+ * equivalent to:
+ *
+ * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+ *
+ * Otherwise, the function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+ */
+ def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+ Monoid.sum(pairs.iterator.map(Map(_)))
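+
+ // A minimal sketch of the zero-dropping behavior (assumes the standard Monoid[Int]):
+ //   sumByKey(List(1 -> 1, 1 -> -1, 2 -> 3)) == Map(2 -> 3)  // key 1 sums to zero and is dropped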
+
+ /**
+ * For each key, creates a list of all values. This function is equivalent to:
+ *
+ * pairs.groupBy(_._1).mapValues(_.map(_._2))
+ */
+ def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+ if (pairs.iterator.isEmpty) Map.empty
+ else {
+ val mutable = MMap[K, Builder[V, List[V]]]()
+ pairs.iterator.foreach { case (k, v) =>
+ val oldVOpt = mutable.get(k)
+ // sorry for the micro optimization here: avoiding a closure
+ val bldr = if (oldVOpt.isEmpty) {
+ val b = List.newBuilder[V]
+ mutable.update(k, b)
+ b
+ } else oldVOpt.get
+ bldr += v
+ }
+ mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+ }
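+
+ // A minimal sketch: group(List(1 -> "a", 1 -> "b", 2 -> "c")) == Map(1 -> List("a", "b"), 2 -> List("c"))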
+
+ // Considering these as edges from k -> v, produce a Map[K, Set[V]]
+ def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+ Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+ /** join the keys of two maps (similar to outer-join in a DB) */
+ def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+ Monoid
+ .plus(
+ map1.transform { case (_, v) =>
+ (List(v), List[W]())
+ },
+ map2.transform { case (_, w) =>
+ (List[V](), List(w))
+ }
+ )
+ .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
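+
+ // A minimal sketch of the outer-join semantics (hypothetical values):
+ //   join(Map(1 -> "a"), Map(1 -> true, 2 -> false)) ==
+ //     Map(1 -> (Some("a"), Some(true)), 2 -> (None, Some(false)))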
+
+ /**
+ * Reverses a graph losslessly. The None key is for v's with no sources.
+ */
+ def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+ def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+ if (i.isEmpty) Iterable(None)
+ else {
+ i.map(Some(_))
+ }
+
+ Monoid.sum {
+ for {
+ (k, sv) <- m.view.toIterable
+ v <- nonEmptyIter(sv)
+ } yield Map(v -> k.toSet)
+ }
+ }
+
+ /**
+ * Invert the common case of exactly one value for each key
+ */
+ def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+ Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+ def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+ Monoid.sum(mring.times(left, right).values)
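+
+ // A minimal sketch (assumes a MapRing[String, Int] in implicit scope):
+ //   dot(Map("x" -> 2), Map("x" -> 3, "y" -> 4)) == 6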
+
+ def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+ val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ c(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+ def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit c: Cuber[K]): Map[c.K, V] =
+ sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+ def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+ val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+ it.iterator.foreach { case (k, v) =>
+ r(k).iterator.foreach { ik =>
+ map.get(ik) match {
+ case Some(vs) => map += ik -> (v :: vs)
+ case None => map += ik -> List(v)
+ }
+ }
+ }
+ map.foreach { case (k, v) => map(k) = v.reverse }
+ new MutableBackedMap(map)
+ }
+
+ def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+ def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+ fn: T => K
+ )(implicit r: Roller[K]): Map[r.K, V] =
+ sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+ .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-3/Scan.scala b/algebird-core/src/main/scala-3/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-3/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+ /**
+ * Most consumers of Scan don't care about the State type variable. But for those that do, we make an
+ * effort to expose it in all of our combinators.
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ */
+ type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+ implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+ def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+ }
+
+ def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+ override type State = Unit
+ override val initialState = ()
+ override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+ }
+
+ /**
+ * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+ * stream of outputs. Streams can be thought of as a hidden state that can be queried for a head
+ * element, plus another hidden state that represents the rest of the stream.
+ * @param initState
+ * The initial state of the scan; think of this as an infinite stream.
+ * @param destructor
+ * This function decomposes a stream into its head-element and tail-stream.
+ * @tparam S
+ * The hidden state of the stream that we are turning into a Scan.
+ * @tparam O
+ * The type of the elements of the stream that we are turning into a Scan
+ * @return
+ * A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+ * stream using the information provided to this method.
+ */
+ def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+ override type State = S
+ override val initialState = initState
+ override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+ destructor(stateBeforeProcessingI)
+ }
+
+ /**
+ * A Scan whose `Nth` output is the number `N` (starting from 0).
+ */
+ val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
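+ // E.g., a minimal sketch: Scan.index(List("a", "b", "c")) == List(0L, 1L, 2L)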
+
+ def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+ /**
+ * @param initStateCreator
+ * A call-by-name method that allocates new mutable state
+ * @param presentAndUpdateStateFn
+ * A function that both presents the output value, and has the side-effect of updating the mutable state
+ * @tparam I
+ * @tparam S
+ * @tparam O
+ * @return
+ * A Scan that safely encapsulates the mutable state while it runs.
+ */
+ def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+ new Scan[I, O] {
+ override type State = S
+ override def initialState = initStateCreator
+ override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+ }
+
+ /**
+ * The trivial scan that always returns the same value, regardless of input
+ * @param t
+ * @tparam T
+ */
+ def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+ /**
+ * @param aggregator
+ * @param initState
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+ * aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+ from(initState) { (a: A, stateBeforeProcessingI: B) =>
+ // nb: the order of the arguments to semigroup.plus here is what determines the order of the final summation;
+ // this matters because not all semigroups are commutative
+ val stateAfterProcessingA =
+ aggregator.append(stateBeforeProcessingI, a)
+ (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * @param monoidAggregator
+ * @tparam A
+ * @tparam B
+ * @tparam C
+ * @return
+ * A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+ * monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+ */
+ def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+ fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
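+
+ // A minimal sketch of a running sum (assumes the standard Monoid[Int]):
+ //   val runningSum = Scan.fromMonoidAggregator(Aggregator.prepareMonoid((i: Int) => i))
+ //   runningSum(List(1, 2, 3)) == List(1, 3, 6)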
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, where you only use one element of the
+ * pair within the `scanLeft`, then throw that element away in a `map` immediately after the scanLeft is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ * The type of elements that the computation is scanning over.
+ * @tparam O
+ * The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+ import Scan.{from, Aux}
+
+ /**
+ * The computation of any given scan involves keeping track of a hidden state.
+ */
+ type State
+
+ /**
+ * The state of the scan before any elements have been processed
+ * @return
+ */
+ def initialState: State
+
+ /**
+ * @param i
+ * An element in the stream to process
+ * @param stateBeforeProcessingI
+ * The state of the scan before processing i
+ * @return
+ * The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+ * result of updating stateBeforeProcessing with the information from i.
+ */
+ def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+ /**
+ * @param iter
+ * @return
+ * If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) =
+ * presentAndNextState(a_(i+1), state_i)` and `state_0 = initialState`
+ */
+ def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+ override def hasNext: Boolean = iter.hasNext
+ var state: State = initialState
+ override def next(): O = {
+ val thisState = state
+ val thisA = iter.next()
+ val (thisC, nextState) = presentAndNextState(thisA, thisState)
+ state = nextState
+ thisC
+ }
+ }
+
+ /**
+ * @param inputs
+ * @param bf
+ * @tparam In
+ * The type of the input collection
+ * @tparam Out
+ * The type of the output collection
+ * @return
+ * Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form:
+ * `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`.
+ */
+ def apply[In <: TraversableOnce[I], Out](
+ inputs: In
+ )(implicit bf: BuildFrom[In, O, Out]): Out =
+ bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+ // combinators
+
+ /**
+ * Return a new scan that is the same as this scan, but with a different `initialState`.
+ * @param newInitialState
+ * @return
+ */
+ def replaceState(newInitialState: => State): Aux[I, State, O] =
+ from(newInitialState)(presentAndNextState(_, _))
+
+ def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+ presentAndNextState(f(i), stateBeforeProcessingI)
+ }
+
+ def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ (g(c), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+ * pollute the `State` by pairing it redundantly with `Unit`.
+ * @tparam I1
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, then this results in a Scan whose `apply` method returns `[(o_1, a_1), ..., (o_n, a_n)]`
+ * when given the same input.
+ */
+ def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+ ((o, i), stateAfterProcessingI)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan before each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+ */
+ def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((stateBeforeProcessingI, o), stateAfterProcessingA)
+ }
+
+ /**
+ * Return a scan whose output is paired with the state of the scan after each input updates the state.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and `state_0 =
+ * initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]`, will return
+ * `[(o_1, state_1), ..., (o_n, state_n)]`.
+ */
+ def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+ val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+ ((c, stateAfterProcessingA), stateAfterProcessingA)
+ }
+
+ /**
+ * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+ * @return
+ * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, return a scan whose apply method, when given the same input, will return `[(o_1, 0),
+ * ..., (o_n, n-1)]`.
+ */
+ def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+ /**
+ * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+ * pairwise zipped outputs.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan whose
+ * apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]`, results in the output `[(o_1, p_1), ...,
+ * (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) == scan(foo).zip(scan2(bar))`
+ */
+ def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+ val (o1, state1AfterProcesingI1) =
+ presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+ val (o2, state2AfterProcesingI2) =
+ scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+ ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+ }
+
+ /**
+ * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+ * on a common input stream.
+ * @param scan2
+ * @tparam I2
+ * @tparam O2
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan whose
+ * apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+ * scan(foo).zip(scan2(foo))`
+ */
+ def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+ val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+ val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+ ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+ }
+
+ /**
+ * Takes the output of this scan and feeds as input into scan2.
+ * @param scan2
+ * @tparam P
+ * @return
+ * If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+ * ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a scan which
+ * returns `[p_1, ..., p_n]`.
+ */
+ def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+ from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+ val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+ val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+ (p, (state1AfterProcesingI, state2AfterProcesingO))
+ }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+ override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+ mt.andThenPresent(fn)
+
+ override def apply[T](v: T): Scan[I, T] =
+ Scan.const(v)
+
+ override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+ mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-3/SpaceSaver.scala b/algebird-core/src/main/scala-3/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-3/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+ * SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+ /**
+ * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+ * public api to create a new SpaceSaver.
+ */
+ def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+ SSMany(capacity, Map(item -> ((count, 0L))))
+
+ private[algebird] val ordering =
+ Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+ (-count, err)
+ }
+
+ implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+ new SpaceSaverSemigroup[T]
+
+ /**
+ * Encodes the SpaceSaver as a sequence of bytes containing in order
+ * - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+ * - 4 bytes: the capacity
+ * - N bytes: the item (for SSOne) or the counters (for SSMany: a count, then for each counter the item
+ * size, the item bytes, and the two Long counters)
+ */
+ def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+ ss match {
+ case SSOne(capacity, item) =>
+ val itemAsBytes = tSerializer(item)
+ val itemLength = itemAsBytes.length
+ // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+ val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+ ByteBuffer
+ .wrap(buffer)
+ .put(1: Byte)
+ .putInt(capacity)
+ .putInt(itemLength)
+ .put(itemAsBytes)
+ buffer
+
+ case SSMany(
+ capacity,
+ counters,
+ _
+ ) => // We do not care about the buckets as they are recreated by SSMany.apply
+ val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+ buffer += (2: Byte)
+
+ var buff = ByteBuffer.allocate(4)
+ buff.putInt(capacity)
+ buffer ++= buff.array()
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(counters.size)
+ buffer ++= buff.array()
+ counters.foreach { case (item, (a, b)) =>
+ val itemAsBytes = tSerializer(item)
+
+ buff = ByteBuffer.allocate(4)
+ buff.putInt(itemAsBytes.length)
+ buffer ++= buff.array()
+
+ buffer ++= itemAsBytes
+
+ buff = ByteBuffer.allocate(8 * 2)
+ buff.putLong(a)
+ buff.putLong(b)
+ buffer ++= buff.array()
+ }
+ buffer.result().toArray
+ }
+
+ // Make sure to be reversible so fromBytes(toBytes(x)) == x
+ def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+ fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
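+
+ // A minimal sketch of the round-trip property (hypothetical UTF-8 serializer):
+ //   val ser = (s: String) => s.getBytes("UTF-8")
+ //   val deser = (b: Array[Byte]) => Try(new String(b, "UTF-8"))
+ //   fromBytes(toBytes(ss, ser), deser) == Success(ss)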
+
+ def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+ Try {
+ bb.get.toInt match {
+ case 1 =>
+ val capacity = bb.getInt
+ val itemLength = bb.getInt
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+ case 2 =>
+ val capacity = bb.getInt
+
+ var countersToDeserialize = bb.getInt
+ val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+ while (countersToDeserialize != 0) {
+ val itemLength = bb.getInt()
+ val itemAsBytes = new Array[Byte](itemLength)
+ bb.get(itemAsBytes)
+ val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+ val a = bb.getLong
+ val b = bb.getLong
+
+ item match {
+ case Failure(e) => return Failure(e)
+ case Success(i) =>
+ counters += ((i, (a, b)))
+ }
+
+ countersToDeserialize -= 1
+ }
+
+ Success(SSMany(capacity, counters.toMap))
+ }
+ }.flatten
+}
+
+/**
+ * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+ import SpaceSaver.ordering
+
+ /**
+ * Maximum number of counters to keep (parameter "m" in the research paper).
+ */
+ def capacity: Int
+
+ /**
+ * Current lowest value for count
+ */
+ def min: Long
+
+ /**
+ * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+ * (error)
+ */
+ def counters: Map[T, (Long, Long)]
+
+ def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+ /**
+ * Returns the frequency estimate for the item
+ */
+ def frequency(item: T): Approximate[Long] = {
+ val (count, err) = counters.getOrElse(item, (min, min))
+ Approximate(count - err, count, count, 1.0)
+ }
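+
+ // A minimal sketch (hypothetical stream): after capacity-10 SpaceSavers for "a", "a", "b" are combined,
+ // frequency("a") == Approximate(2L, 2L, 2L, 1.0), since fewer distinct items than capacity means no error.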
+
+ /**
+ * Get the elements that show up at least `thres` times. Returns sorted in descending order: (item,
+ * Approximate[Long], guaranteed)
+ */
+ def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+ counters.iterator
+ .filter { case (_, (count, _)) => count >= thres }
+ .toList
+ .sorted(ordering)
+ .map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+ }
+
+ /**
+ * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+ */
+ def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+ require(k < capacity)
+ val si = counters.toList
+ .sorted(ordering)
+ val siK = si.take(k)
+ val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+ siK.map { case (item, (count, err)) =>
+ (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+ }
+ }
+
+ /**
+ * Check consistency with other SpaceSaver, useful for testing. Returns boolean indicating if they are
+ * consistent
+ */
+ def consistentWith(that: SpaceSaver[T]): Boolean =
+ (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0)
+}
+
+case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] {
+ require(capacity > 1)
+
+ override def min: Long = 0L
+
+ override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L)))
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => SSMany(this).add(other)
+ case other: SSMany[?] => other.add(this)
+ }
+}
+
+object SSMany {
+ private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] =
+ SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap
+
+ private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] =
+ SSMany(capacity, counters, bucketsFromCounters(counters))
+
+ private[algebird] def apply[T](one: SSOne[T]): SSMany[T] =
+ SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item)))
+}
+
+case class SSMany[T] private (
+ override val capacity: Int,
+ override val counters: Map[T, (Long, Long)],
+ buckets: SortedMap[Long, Set[T]]
+) extends SpaceSaver[T] {
+ private val exact: Boolean = counters.size < capacity
+
+ override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey
+
+ // item is already present and just needs to be bumped up one
+ private def bump(item: T) = {
+ val (count, err) = counters(item)
+ val counters1 = counters + (item -> ((count + 1L, err))) // increment by one
+ val currBucket = buckets(count) // current bucket
+ val buckets1 = {
+ if (currBucket.size == 1) // delete current bucket since it will be empty
+ buckets - count
+ else // remove item from current bucket
+ buckets + (count -> (currBucket - item))
+ } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // lose one item to meet capacity constraint
+ private def loseOne = {
+ val firstBucket = buckets(buckets.firstKey)
+ val itemToLose = firstBucket.head
+ val counters1 = counters - itemToLose
+ val buckets1 =
+ if (firstBucket.size == 1)
+ buckets - min
+ else
+ buckets + (min -> (firstBucket - itemToLose))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // introduce new item
+ private def introduce(item: T, count: Long, err: Long) = {
+ val counters1 = counters + (item -> ((count, err)))
+ val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item))
+ SSMany(capacity, counters1, buckets1)
+ }
+
+ // add a single element
+ private[algebird] def add(x: SSOne[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ if (counters.contains(x.item))
+ bump(x.item)
+ else
+ (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min)
+ }
+
+ // merge two stream summaries
+ private def merge(x: SSMany[T]): SSMany[T] = {
+ require(x.capacity == capacity)
+ val counters1 = Map() ++
+ (counters.keySet ++ x.counters.keySet).toList
+ .map { key =>
+ val (count1, err1) = counters.getOrElse(key, (min, min))
+ val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min))
+ key -> ((count1 + count2, err1 + err2))
+ }
+ .sorted(SpaceSaver.ordering)
+ .take(capacity)
+ SSMany(capacity, counters1)
+ }
+
+ override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+ case other: SSOne[?] => add(other)
+ case other: SSMany[?] => merge(other)
+ }
+}
+
+class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] {
+ override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y
+}
diff --git a/algebird-core/src/main/scala-3/VectorSpace.scala b/algebird-core/src/main/scala-3/VectorSpace.scala
new file mode 100644
index 000000000..f8818600c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/VectorSpace.scala
@@ -0,0 +1,59 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import scala.annotation.implicitNotFound
+
+/**
+ * This class represents a vector space. For the required properties see:
+ *
+ * http://en.wikipedia.org/wiki/Vector_space#Definition
+ */
+object VectorSpace extends VectorSpaceOps with Implicits
+
+sealed trait VectorSpaceOps {
+ def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] =
+ vs.scale(v, c)
+ def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] =
+ new VectorSpace[F, C] {
+ override def ring: Ring[F] = r
+ override def group: Group[C[F]] = cGroup
+ override def scale(v: F, c: C[F]): C[F] =
+ if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero
+ }
+}
+private object VectorSpaceOps extends VectorSpaceOps
+
+sealed trait Implicits extends LowPrioImpicits {
+ implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] =
+ VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _)))
+}
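+
+// A minimal sketch of scaling (assumes algebird's Ring[Int] instance):
+//   VectorSpace.scale(2, IndexedSeq(1, 2, 3)) == IndexedSeq(2, 4, 6)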
+
+sealed trait LowPrioImpicits {
+ implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] =
+ VectorSpaceOps.from[T, Map[K, _]] { (s, m) =>
+ m.transform { case (_, v) => Ring.times(s, v) }
+ }
+}
+
+@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}")
+trait VectorSpace[F, C[_]] extends java.io.Serializable {
+ implicit def ring: Ring[F]
+ def field: Ring[F] = ring // this is for compatibility with older versions
+ implicit def group: Group[C[F]]
+ def scale(v: F, c: C[F]): C[F]
+}
diff --git a/algebird-core/src/main/scala-3/monad/EitherMonad.scala b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
new file mode 100644
index 000000000..b6d5e2ffc
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// Monad for either, used for modeling Error where L is the type of the error
+object EitherMonad {
+ class Error[L] extends Monad[Either[L, *]] {
+ override def apply[R](r: R): Right[L, R] = Right(r)
+
+ override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] =
+ self.right.flatMap(next)
+
+ override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] =
+ self.right.map(fn)
+ }
+
+ implicit def monad[L]: Monad[Either[L, _]] = new Error[L]
+
+ def assert[L](truth: Boolean, failure: => L): Either[L, Unit] =
+ if (truth) Right(()) else Left(failure)
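+
+ // A minimal sketch: assert short-circuits a right-biased flatMap chain on failure.
+ //   assert(1 > 0, "must be positive").flatMap(_ => Right(42)) == Right(42)
+ //   assert(0 > 1, "must be positive").flatMap(_ => Right(42)) == Left("must be positive")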
+}
diff --git a/algebird-core/src/main/scala-3/monad/Reader.scala b/algebird-core/src/main/scala-3/monad/Reader.scala
new file mode 100644
index 000000000..e0747af20
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/Reader.scala
@@ -0,0 +1,76 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// TODO this is general, move somewhere better
+
+// Reader Monad, represents a series of operations that mutate some environment
+// type (the input to the function)
+
+sealed trait Reader[-Env, +T] {
+ def apply(env: Env): T
+ def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ FlatMappedReader[E1, T, U](this, next)
+ def map[U](thatFn: T => U): Reader[Env, U] =
+ FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t)))
+}
+
+final case class ConstantReader[+T](get: T) extends Reader[Any, T] {
+ override def apply(env: Any): T = get
+ override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get))
+ override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] =
+ next(get)
+}
+final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] {
+ override def apply(env: E): T = fn(env)
+}
+final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] {
+ override def apply(env: E): T = {
+ @annotation.tailrec
+ def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any =
+ r match {
+ case ConstantReader(get) =>
+ stack match {
+ case head :: tail => loop(head(get), tail)
+ case Nil => get
+ }
+ case ReaderFn(fn) =>
+ stack match {
+ case head :: tail => loop(head(fn(env)), tail)
+ case Nil => fn(env)
+ }
+ case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack)
+ }
+ loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T]
+ }
+}
+
+object Reader {
+ def const[T](t: T): Reader[Any, T] = ConstantReader(t)
+ implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn)
+
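+ // A minimal sketch of building and running a Reader (hypothetical environment type String):
+ //   val len: Reader[String, Int] = Reader((s: String) => s.length)
+ //   len.map(_ * 2).apply("four") == 8
+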
+ class ReaderM[Env] extends Monad[Reader[Env, _]] {
+ override def apply[T](t: T): ConstantReader[T] = ConstantReader(t)
+ override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] =
+ self.flatMap(next)
+ override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] = self.map(fn)
+ }
+
+ implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env]
+}
diff --git a/algebird-core/src/main/scala-3/monad/StateWithError.scala b/algebird-core/src/main/scala-3/monad/StateWithError.scala
new file mode 100644
index 000000000..e15a9ebc3
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/StateWithError.scala
@@ -0,0 +1,130 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.{Monad, Semigroup}
+
+/**
+ * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase
+ * with existing mutable APIs (like Storm or Cascading), but retain the ability to compose carefully.
+ */
+sealed trait StateWithError[S, +F, +T] {
+ def join[F1 >: F, U](
+ that: StateWithError[S, F1, U],
+ mergeErr: (F1, F1) => F1,
+ mergeState: (S, S) => S
+ ): StateWithError[S, F1, (T, U)] =
+ join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState))
+
+ def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit
+ sgf: Semigroup[F1],
+ sgs: Semigroup[S]
+ ): // TODO: deep joins could blow the stack, not yet using trampoline here
+ StateWithError[S, F1, (T, U)] =
+ StateFn { (requested: S) =>
+ (run(requested), that.run(requested)) match {
+ case (Right((s1, r1)), Right((s2, r2))) =>
+ Right((sgs.plus(s1, s2), (r1, r2)))
+ case (Left(err1), Left(err2)) =>
+ Left(sgf.plus(err1, err2)) // combine the errors from both sides
+ case (Left(err), _) => Left(err)
+ case (_, Left(err)) => Left(err)
+ }
+ }
+
+ def apply(state: S): Either[F, (S, T)] = run(state)
+
+ def run(state: S): Either[F, (S, T)]
+
+ def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] =
+ FlatMappedState(this, next)
+
+ def map[U](fn: (T) => U): StateWithError[S, F, U] =
+ FlatMappedState(this, (t: T) => StateWithError.const(fn(t)))
+}
+
+/** Simple wrapper of a function in the Monad */
+final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] {
+ override def run(state: S): Either[F, (S, T)] = fn(state)
+}
+
+/**
+ * A Trampolining instance that should prevent stack overflow at the expense of performance
+ */
+final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U])
+ extends StateWithError[S, F, U] {
+ override def run(state: S): Either[F, (S, U)] = {
+ @annotation.tailrec
+ def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any =
+ st match {
+ case StateFn(fn) =>
+ fn(inState) match {
+ case err @ Left(_) => err // bail at first error
+ case noError @ Right((newState, out)) =>
+ stack match {
+ case head :: tailStack => loop(newState, head(out), tailStack)
+ case Nil => noError // recursion ends
+ }
+ }
+ case FlatMappedState(st, next) => loop(inState, st, next :: stack)
+ }
+ loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]]
+ }
+}
+
+object StateWithError {
+ def getState[S]: StateWithError[S, Nothing, S] =
+ StateFn((state: S) => Right((state, state)))
+ def putState[S](newState: S): StateWithError[S, Nothing, Unit] =
+ StateFn((_: S) => Right((newState, ())))
+ def swapState[S](newState: S): StateWithError[S, Nothing, S] =
+ StateFn((old: S) => Right((newState, old)))
+
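+ // A minimal sketch of threading state (hypothetical Int-counter state):
+ //   val incr = StateWithError[Int, String, Int]((s: Int) => Right((s + 1, s)))
+ //   incr.flatMap(_ => StateWithError.getState).run(0) == Right((1, 1))
+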
+ def const[S, T](t: T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] =
+ StateFn((state: S) => Right((state, t)))
+ def failure[S, F](f: F): StateWithError[S, F, Nothing] =
+ StateFn(_ => Left(f))
+
+ /**
+ * Use like fromEither[Int](Right("good")) to get a constant Either in the monad
+ */
+ def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S]
+ class ConstantStateMaker[S] {
+ def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) }
+ }
+
+ class FunctionLifter[S] {
+ def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) =>
+ StateFn((s: S) => fn(i).right.map((s, _)))
+ }
+ }
+ // TODO this should move to Monad and work for any Monad
+ def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S]
+
+ implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn)
+ implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S]
+
+ class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] {
+ override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) }
+ override def flatMap[T, U](
+ earlier: StateWithError[S, F, T]
+ )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] =
+ earlier.flatMap(next)
+ }
+}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala
index 29329b788..53a0eff17 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala
@@ -87,9 +87,9 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3.
summingCache = new SummingWithHitsCache(currentCapacity)
if (currentCapacity == maxCapacity)
- sentinelCache.stopGrowing
+ sentinelCache.stopGrowing()
else
- sentinelCache.clear
+ sentinelCache.clear()
}
ret
}
@@ -101,7 +101,7 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3.
override def flush: Option[Map[K, V]] = {
val ret = summingCache.flush
- sentinelCache.clear
+ sentinelCache.clear()
ret
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala
index e47fb8792..31f5117bc 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala
@@ -145,7 +145,7 @@ object AdaptiveVector {
def iteq: Boolean =
(lit.hasNext, rit.hasNext) match {
case (true, true) =>
- val (lnext, rnext) = (lit.next, rit.next)
+ val (lnext, rnext) = (lit.next(), rit.next())
if (lnext._1 == rnext._1 && Equiv[V].equiv(lnext._2, rnext._2))
iteq
else
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala
index 32a66339a..211cac612 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala
@@ -42,7 +42,7 @@ trait Applicative[M[_]] extends Functor[M] {
case _ =>
val mb =
ms.foldLeft(apply(Seq.newBuilder[T]))((mb, mt) => joinWith(mb, mt)((b, t) => b += t))
- map(mb)(_.result)
+ map(mb)(_.result())
}
def joinWith[T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V): M[V] =
map(join(mt, mu)) { case (t, u) => fn(t, u) }
@@ -102,7 +102,7 @@ object Applicative {
)(implicit app: Applicative[M], cbf: Factory[T, R[T]]): M[R[T]] = {
val bldr = cbf.newBuilder
val mbldr = ms.iterator.foldLeft(app.apply(bldr))((mb, mt) => app.joinWith(mb, mt)(_ += _))
- app.map(mbldr)(_.result)
+ app.map(mbldr)(_.result())
}
def joinWith[M[_], T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V)(implicit app: Applicative[M]): M[V] =
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala
index 9d684db79..efef198e3 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala
@@ -112,7 +112,7 @@ object AveragedValue {
*/
def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, AveragedValue, Double] =
Aggregator
- .prepareMonoid { n: N => AveragedValue(num.toDouble(n)) }
+ .prepareMonoid { (n: N) => AveragedValue(num.toDouble(n)) }
.andThenPresent(_.value)
/**
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala
index d209a98dc..0db108a3a 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala
@@ -104,7 +104,7 @@ object Batched {
if (ts.iterator.isEmpty) None
else {
val it = ts.iterator
- val t0 = it.next
+ val t0 = it.next()
Some(Item(t0).append(it))
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
index bda97981d..5ea0f11d5 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
@@ -33,7 +33,7 @@ object RichCBitSet {
def fromBitSet(bs: BitSet): CBitSet = {
val nbs = new CBitSet
val it = bs.iterator
- while (it.hasNext) { nbs.set(it.next) }
+ while (it.hasNext) { nbs.set(it.next()) }
nbs
}
implicit def cb2rcb(cb: CBitSet): RichCBitSet = new RichCBitSet(cb)
@@ -235,7 +235,7 @@ case class BloomFilterMonoid[A](numHashes: Int, width: Int)(implicit hash: Hash1
case BFInstance(_, bitset, _) =>
// these Ints are boxed so, that's a minor bummer
val iter = bitset.iterator
- while (iter.hasNext) { set(iter.next) }
+ while (iter.hasNext) { set(iter.next()) }
}
if (sets == 0) Some(zero)
else if (sets == numHashes && (oneItem != null)) Some(oneItem)
@@ -307,7 +307,7 @@ object BF {
new IntIterator {
val boxedIter: Iterator[Int] = bitset.iterator
override def hasNext: Boolean = boxedIter.hasNext
- override def next: Int = boxedIter.next
+ override def next: Int = boxedIter.next()
}
case BFZero(_, _) =>
new IntIterator {
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala
index e8c45b668..102f2e3c7 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala
@@ -45,7 +45,7 @@ abstract class ArrayBufferedOperation[I, O](size: Int) extends Buffered[I, O] {
if (buffer.isEmpty) None
else {
val res = operate(buffer.toSeq)
- buffer.clear
+ buffer.clear()
Some(res)
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala
index 2f6d6e988..3a01eee07 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala
@@ -105,7 +105,7 @@ case class ExpHist(
b += bucket
},
_ => Vector.newBuilder[Bucket],
- x => addAll(x.result)
+ x => addAll(x.result())
)
// This internal method assumes that the instance is stepped forward
@@ -182,7 +182,7 @@ object ExpHist {
case class Bucket(size: Long, timestamp: Timestamp)
object Bucket {
- implicit val ord: Ordering[Bucket] = Ordering.by { b: Bucket => (b.timestamp, b.size) }
+ implicit val ord: Ordering[Bucket] = Ordering.by { (b: Bucket) => (b.timestamp, b.size) }
}
/**
@@ -260,7 +260,7 @@ object ExpHist {
if (desired.isEmpty) Vector.empty
else {
val input = buckets.dropWhile(_.size == 0)
- val bucketSize +: tail = desired
+ val bucketSize +: tail = desired: @unchecked
val remaining = drop(bucketSize, input)
input.head.copy(size = bucketSize) +: rebucket(remaining, tail)
}
@@ -275,7 +275,7 @@ object ExpHist {
* If an element wasn't fully consumed, the remainder will be stuck back onto the head.
*/
@tailrec private[this] def drop(toDrop: Long, input: Vector[Bucket]): Vector[Bucket] = {
- val (b @ Bucket(count, _)) +: tail = input
+ val (b @ Bucket(count, _)) +: tail = input: @unchecked
(toDrop - count) match {
case 0 => tail
case x if x < 0 => b.copy(size = -x) +: tail
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala
index 0d86aa03e..03b1dad0c 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala
@@ -27,7 +27,7 @@ class HashingTrickMonoid[V: Group](bits: Int, seed: Int = 123456) extends Monoid
Monoid.plus(left, right)
def init[K](kv: (K, V))(implicit ev: K => Array[Byte]): AdaptiveVector[V] = {
- val (long1, long2) = hash(kv._1)
+ val (long1, long2): (Long, Long) = hash(kv._1)
val index = (long1 & bitMask).toInt
val isNegative = (long2 & 1) == 1
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
index adac1141d..0fc0b97e6 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
@@ -419,7 +419,7 @@ case class SparseHLL(override val bits: Int, maxRhow: Map[Int, Max[Byte]]) exten
val iter: Iterator[(Int, Max[Byte])] = maxRhow.iterator
while (iter.hasNext) {
- val (idx, _) = iter.next
+ val (idx, _) = iter.next()
val existing: Byte = newContents(idx)
val other: Byte = maxRhow(idx).get
@@ -575,7 +575,7 @@ class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] with BoundedSemilatti
None
} else {
val iter = items.iterator.buffered
- var curValue = iter.next
+ var curValue = iter.next()
while (iter.hasNext) {
curValue = (curValue, iter.head) match {
case (DenseHLL(_, _), _) => denseUpdate(curValue, iter)
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala
index f795b1a4c..75b5c7ccc 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala
@@ -62,7 +62,7 @@ case class HLLSeries(bits: Int, rows: Vector[Map[Int, Long]]) {
while (i >= 0) {
val it = rows(i).iterator
while (it.hasNext) {
- val (k, t) = it.next
+ val (k, t) = it.next()
if (t >= threshold && seen.add(k)) {
sum += HyperLogLog.negativePowersOfTwo(i + 1)
}
@@ -142,7 +142,7 @@ class HyperLogLogSeriesMonoid(val bits: Int) extends Monoid[HLLSeries] {
val bldr = Vector.newBuilder[Map[Int, Long]]
val lit = left.rows.iterator
val rit = right.rows.iterator
- while (lit.hasNext && rit.hasNext) bldr += combine(lit.next, rit.next)
+ while (lit.hasNext && rit.hasNext) bldr += combine(lit.next(), rit.next())
val zipped = bldr.result()
HLLSeries(bits, zipped ++ right.rows.slice(ln, rn))
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala
index df95c4691..6e84c7541 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala
@@ -160,8 +160,8 @@ private[algebird] sealed abstract class LowPriorityMaxInstances {
while (true) {
if (xs.hasNext) {
if (ys.hasNext) {
- val x = xs.next
- val y = ys.next
+ val x = xs.next()
+ val y = ys.next()
val cmp = ord.compare(x, y)
if (cmp != 0) return cmp
} else {
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala
index e5c6df39b..fc4dd10e8 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala
@@ -73,7 +73,7 @@ object Metric {
def minkowskiMap[K, V: Monoid: Metric](p: Double): Metric[Map[K, V]] =
Metric.from { (a: Map[K, V], b: Map[K, V]) =>
- val outP = (a.keySet ++ b.keySet).map { key: K =>
+ val outP = (a.keySet ++ b.keySet).map { (key: K) =>
val v1 = a.getOrElse(key, Monoid.zero[V])
val v2 = b.getOrElse(key, Monoid.zero[V])
math.pow(implicitly[Metric[V]].apply(v1, v2), p)
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
index ada06450b..5c6b9ebc9 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
@@ -69,7 +69,7 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N
private val hashFunctions = {
val r = new scala.util.Random(seed)
val numHashFunctions = math.ceil(numBytes / 16.0).toInt
- (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong))
+ (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong()))
}
/** Signature for empty set, needed to be a proper Monoid */
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala
index 74eb5a428..9da380b3e 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala
@@ -248,7 +248,7 @@ object Moments {
val fold: Fold[Double, Moments] = momentsMonoid.zero.fold
def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, Moments, Moments] =
- Aggregator.prepareMonoid { n: N => Moments(num.toDouble(n)) }
+ Aggregator.prepareMonoid { (n: N) => Moments(num.toDouble(n)) }
/**
* Create a Moments object given a single value. This is useful for initializing moment calculations at the
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala
index de8c31a71..cd14c7a96 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala
@@ -57,7 +57,7 @@ object Monad {
if (xs.isEmpty)
monad.apply(acc)
else
- monad.flatMap(fn(acc, xs.head)) { t: T => foldM(t, xs.tail)(fn) }
+ monad.flatMap(fn(acc, xs.head)) { (t: T) => foldM(t, xs.tail)(fn) }
// Some instances of the Monad typeclass (case for a macro):
implicit val list: Monad[List] = new Monad[List] {
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala
index a10d6d8a8..1d81a888e 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala
@@ -187,10 +187,10 @@ trait FlatMapPreparer[A, T] extends Preparer[A, T] {
def prepareFn: A => TraversableOnce[T]
def map[U](fn: T => U): FlatMapPreparer[A, U] =
- FlatMapPreparer { a: A => prepareFn(a).map(fn) }
+ FlatMapPreparer { (a: A) => prepareFn(a).map(fn) }
override def flatMap[U](fn: T => TraversableOnce[U]): FlatMapPreparer[A, U] =
- FlatMapPreparer { a: A => prepareFn(a).flatMap(fn) }
+ FlatMapPreparer { (a: A) => prepareFn(a).flatMap(fn) }
override def monoidAggregate[B, C](aggregator: MonoidAggregator[T, B, C]): MonoidAggregator[A, B, C] =
aggregator.sumBefore.composePrepare(prepareFn)
@@ -242,10 +242,10 @@ object FlatMapPreparer {
override val prepareFn: TraversableOnce[A] => TraversableOnce[A] = (a: TraversableOnce[A]) => a
override def map[U](fn: A => U): FlatMapPreparer[TraversableOnce[A], U] =
- FlatMapPreparer { a: TraversableOnce[A] => a.map(fn) }
+ FlatMapPreparer { (a: TraversableOnce[A]) => a.map(fn) }
override def flatMap[U](fn: A => TraversableOnce[U]): FlatMapPreparer[TraversableOnce[A], U] =
- FlatMapPreparer { a: TraversableOnce[A] => a.flatMap(fn) }
+ FlatMapPreparer { (a: TraversableOnce[A]) => a.flatMap(fn) }
override def monoidAggregate[B, C](
aggregator: MonoidAggregator[A, B, C]
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala
index 2376cfbf8..c78897715 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala
@@ -151,9 +151,9 @@ class QTreeSemigroup[A](k: Int)(implicit val underlyingMonoid: Monoid[A]) extend
val batchSize = compressBatchSize
var count = 1 // start at 1, so we only compress after batchSize items
val iter = items.toIterator
- var result = iter.next // due to not being empty, this does not throw
+ var result = iter.next() // due to not being empty, this does not throw
while (iter.hasNext) {
- result = result.merge(iter.next)
+ result = result.merge(iter.next())
count += 1
if (count % batchSize == 0) {
result = result.compress(k)
@@ -428,8 +428,8 @@ class QTree[@specialized(Int, Long, Float, Double) A] private[algebird] (
print(" (" + parentCount + ")")
}
println(" {" + _sum + "}")
- lowerChild.foreach(_.dump)
- upperChild.foreach(_.dump)
+ lowerChild.foreach(_.dump())
+ upperChild.foreach(_.dump())
}
/**
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala
index f5973c338..e327ed57c 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala
@@ -145,7 +145,7 @@ case class SketchMapParams[K](seed: Int, width: Int, depth: Int, heavyHittersCou
val numCounters = width
(0 to (numHashes - 1)).map { _ =>
val smhash: SketchMapHash[K] =
- SketchMapHash(CMSHash[Long](r.nextInt, 0, numCounters), seed)(serialization)
+ SketchMapHash(CMSHash[Long](r.nextInt(), 0, numCounters), seed)(serialization)
(k: K) => smhash(k)
}
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala
index 4cd9a1505..e2302e899 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala
@@ -57,7 +57,7 @@ class SummingCache[K, V](capacity: Int)(implicit sgv: Semigroup[V]) extends Stat
override def flush: Option[Map[K, V]] = {
// Get a copy of the cache, since it is mutable
val res = optNonEmpty(cache.iterator.toMap)
- cache.clear
+ cache.clear()
res
}
override def isFlushed: Boolean = cache.isEmpty
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala
index cd9e7deaf..7644aca2e 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala
@@ -49,16 +49,16 @@ class SummingIterator[V](summer: StatefulSummer[V], it: Iterator[V])
// This has to be lazy because it shouldn't be touched until the val it is exhausted
protected lazy val tailIter: Iterator[V] = summer.flush.iterator
override def hasNext: Boolean = it.hasNext || tailIter.hasNext
- override def next: V = nextInternal
+ override def next(): V = nextInternal
@tailrec
private def nextInternal: V =
if (it.hasNext) {
- summer.put(it.next) match {
+ summer.put(it.next()) match {
case None => nextInternal
case Some(v) => v
}
} else {
- tailIter.next
+ tailIter.next()
}
}
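
Why `next` becomes `next()` here and in the hunks around it: these methods are declared with an empty parameter list, and Scala 2.13 deprecates calling them without parentheses ("auto-application") while Scala 3 rejects it outright; an override must repeat the parentheses as well. A small sketch of the rule (illustrative):

    val it = Iterator(1, 2, 3)
    val first = it.next()   // correct everywhere
    // it.next              // deprecated on 2.13, an error on Scala 3

    // Overrides must match the declared shape:
    class Once extends Iterator[Int] {
      private var done = false
      override def hasNext: Boolean = !done          // declared without parens, stays bare
      override def next(): Int = { done = true; 1 }  // declared with (), keeps ()
    }
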
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala
index 8df431d7e..199553780 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala
@@ -126,7 +126,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] {
val it = ws.toIterator
var queue = Queue.empty[T]
while (it.hasNext) {
- queue = (queue ++ it.next.items).takeRight(windowSize)
+ queue = (queue ++ it.next().items).takeRight(windowSize)
}
Some(Window(monoid.sum(queue), queue))
}
@@ -140,7 +140,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] {
while (it.hasNext) {
// avoid materializing the whole list in memory
// at one time
- queue = queue :+ it.next
+ queue = queue :+ it.next()
size = size + 1
if (size > windowSize) {
queue = queue.tail
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala
index d58a6c9ab..3e90cadcf 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala
@@ -573,7 +573,7 @@ object BitSet {
BitSet.adoptedUnion(this, rhs)
} else {
// height == rhs.height, so we know rhs is a Branch.
- val Branch(_, _, rcs) = rhs
+ val Branch(_, _, rcs) = rhs: @unchecked
val cs = new Array[BitSet](32)
var i = 0
while (i < 32) {
@@ -605,7 +605,7 @@ object BitSet {
Empty
} else {
// height == rhs.height, so we know rhs is a Branch.
- val Branch(_, _, rcs) = rhs
+ val Branch(_, _, rcs) = rhs: @unchecked
val cs = new Array[BitSet](32)
var i = 0
var nonEmpty = false
@@ -643,7 +643,7 @@ object BitSet {
false
} else {
// height == rhs.height, so we know rhs is a Branch.
- val Branch(_, _, rcs) = rhs
+ val Branch(_, _, rcs) = rhs: @unchecked
var i = 0
while (i < 32) {
val x = children(i)
@@ -688,7 +688,7 @@ object BitSet {
this | rhs
} else {
// height == rhs.height, so we know rhs is a Branch.
- val Branch(_, _, rcs) = rhs
+ val Branch(_, _, rcs) = rhs: @unchecked
val cs = new Array[BitSet](32)
var i = 0
while (i < 32) {
@@ -805,7 +805,7 @@ object BitSet {
throw InternalError("branch misaligned")
} else {
// height == rhs.height, so we know rhs is a Branch.
- val Branch(_, _, rcs) = rhs
+ val Branch(_, _, rcs) = rhs: @unchecked
var i = 0
while (i < 32) {
val x = children(i)
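
Why the `: @unchecked` ascriptions: each of these destructuring `val`s matches `rhs` against the refutable pattern `Branch(...)`, justified by the preceding height check. Scala 3 flags such refutable pattern bindings (a warning that newer releases escalate to an error) unless the scrutinee is ascribed `: @unchecked`, which asserts the match cannot fail and suppresses the check. A self-contained sketch with a hypothetical `Tree` type (not the real BitSet hierarchy):

    sealed trait Tree
    final case class Node(left: Tree, right: Tree) extends Tree
    case object Leaf extends Tree

    val t: Tree = Node(Leaf, Leaf)
    // val Node(l, r) = t             // Scala 3: refutable pattern binding is flagged
    val Node(l, r) = t: @unchecked    // compiles; would throw MatchError if t were Leaf
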
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala
index 71a861075..572dce367 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala
@@ -272,7 +272,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H
override def +(other: A): Hash = {
val bs = BitSet.newEmpty(0)
- val hash = new Array[Int](numHashes)
+ val hash = new Array[Int](this.numHashes)
hashToArray(item, hash)
bs.mutableAdd(hash)
@@ -336,7 +336,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H
// use an approximation width of 0.05
override def size: Approximate[Long] =
- BloomFilter.sizeEstimate(numBits, numHashes, width, 0.05)
+ BloomFilter.sizeEstimate(this.numBits, numHashes, width, 0.05)
}
implicit val monoid: Monoid[Hash] with BoundedSemilattice[Hash] =
@@ -402,7 +402,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H
/**
* Create a bloom filter with multiple items from an iterator
*/
- def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item))
+ def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item.apply))
val empty: Hash = Empty
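
Why `Item` becomes `Item.apply`: in Scala 3 the synthetic companion object of a case class no longer extends `FunctionN`, so the bare companion can no longer be passed where a function is expected; the explicit `.apply` reference cross-compiles. A sketch with a hypothetical case class:

    final case class Wrap(n: Int)
    List(1, 2, 3).map(Wrap.apply)   // works on 2.12, 2.13 and 3.x
    // List(1, 2, 3).map(Wrap)      // Scala 2 only: relied on Wrap extending Int => Wrap
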
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala
index f970c43f3..c50d912d7 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala
@@ -95,7 +95,7 @@ object AdaptiveMatrix {
var row = 0
val iter = storage.iterator
while (iter.hasNext) {
- val curRow = iter.next
+ val curRow = iter.next()
curRow.foreach { case (col, value) =>
buffer(row * cols + col) = value
}
@@ -114,7 +114,7 @@ object AdaptiveMatrix {
val sparseStorage = (0 until rows).map(_ => MMap[Int, V]()).toIndexedSeq
while (iter.hasNext) {
- val current = iter.next
+ val current = iter.next()
current match {
case d @ DenseMatrix(_, _, _) => return denseUpdate(d, iter)
case s @ SparseColumnMatrix(_) =>
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala
index 69f553360..96f201eb8 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala
@@ -49,7 +49,7 @@ case class SparseColumnMatrix[V: Monoid](rowsByColumns: IndexedSeq[AdaptiveVecto
while (row < rows) {
val iter = rowsByColumns(row).denseIterator
while (iter.hasNext) {
- val (col, value) = iter.next
+ val (col, value): (Int, V) = iter.next()
val indx = row * lcols + col
buffer(indx) = valueMonoid.plus(buffer(indx), value)
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala
index 5c3e4c37b..38c026937 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala
@@ -36,7 +36,7 @@ private class IterCallStatistics(threadSafe: Boolean) {
total.add(v)
// log2(v + 1) for v up to 2^maxBucket
val bucket = min(64 - numberOfLeadingZeros(v), maxBucket)
- distribution(bucket).increment
+ distribution(bucket).increment()
}
def count: Long = distribution.foldLeft(0L)(_ + _.get) // sum
@@ -59,8 +59,8 @@ private class IterCallStatistics(threadSafe: Boolean) {
private class CountingIterator[T](val i: Iterator[T]) extends Iterator[T] {
private[this] final var nextCount: Long = 0
override def hasNext: Boolean = i.hasNext
- override def next: T = {
- val n = i.next
+ override def next(): T = {
+ val n = i.next()
nextCount += 1
n
}
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala
index ce166c250..3becb8b8a 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala
@@ -37,7 +37,7 @@ class StatisticsSemigroup[T](threadSafe: Boolean = true)(implicit wrappedSemigro
def getSumOptionCallTime: Long = sumOptionCallsStats.getTotalCallTime
override def plus(x: T, y: T): T = {
- plusCallsCount.increment
+ plusCallsCount.increment()
Semigroup.plus(x, y)
}
@@ -66,7 +66,7 @@ class StatisticsMonoid[T](threadSafe: Boolean = true)(implicit wrappedMonoid: Mo
def getSumCallTime: Long = sumCallsStats.getTotalCallTime
override def zero: T = {
- zeroCallsCount.increment
+ zeroCallsCount.increment()
Monoid.zero
}
@@ -95,12 +95,12 @@ class StatisticsGroup[T](threadSafe: Boolean = true)(implicit group: Group[T])
def getMinusCallCount: Long = minusCallsCount.get
override def negate(x: T): T = {
- negateCallsCount.increment
+ negateCallsCount.increment()
Group.negate(x)
}
override def minus(l: T, r: T): T = {
- minusCallsCount.increment
+ minusCallsCount.increment()
Group.minus(l, r)
}
@@ -129,12 +129,12 @@ class StatisticsRing[T](threadSafe: Boolean = true)(implicit ring: Ring[T])
def getProductCallTime: Long = productCallsStats.getTotalCallTime
override def one: T = {
- oneCallsCount.increment
+ oneCallsCount.increment()
Ring.one
}
override def times(x: T, y: T): T = {
- timesCallsCount.increment
+ timesCallsCount.increment()
Ring.times(x, y)
}
diff --git a/build.sbt b/build.sbt
index afc7de9c7..bcd23c4f2 100644
--- a/build.sbt
+++ b/build.sbt
@@ -31,6 +31,8 @@ def scalaBinaryVersion(scalaVersion: String) = scalaVersion match {
case version => sys.error(s"unsupported scala version $version")
}
+def isScala3(scalaVersion: String) = scalaVersion.startsWith("3.")
+
def isScala212x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.12"
def isScala213x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.13"
@@ -110,6 +112,16 @@ val sharedSettings = Seq(
scalaVersion.value
)
) ++ mimaSettings
+// NOTE: After dropping Scala 2.11, we can remove src/main/scala-2.11 and share sources between Scala 2.12, 2.13, and 3.x.
+lazy val kindprojectorSettings = Seq(
+ Compile / scalacOptions ++= {
+ CrossVersion.partialVersion(scalaVersion.value) match {
+ case Some((3, _)) => Seq("-Ykind-projector:underscores")
+ case Some((2, 12 | 13)) => Seq("-Xsource:3", "-P:kind-projector:underscore-placeholders")
+ case _ => Seq.empty
+ }
+ }
+)
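
What these options buy: with `-Ykind-projector:underscores` on Scala 3 and `-P:kind-projector:underscore-placeholders` (plus `-Xsource:3`) on Scala 2's kind-projector plugin, both compilers read `_` in type position as a type-lambda placeholder, so partially applied type constructors are written identically across versions. A minimal sketch assuming these settings are applied:

    trait Functor[F[_]] { def map[A, B](fa: F[A])(f: A => B): F[B] }

    // Map[String, _] is read as the type lambda "[V] =>> Map[String, V]"
    val mapValues: Functor[Map[String, _]] = new Functor[Map[String, _]] {
      def map[A, B](fa: Map[String, A])(f: A => B): Map[String, B] =
        fa.map { case (k, v) => (k, f(v)) }
    }
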
lazy val noPublishSettings = Seq(
publish / skip := true,
@@ -208,33 +220,43 @@ def module(name: String) = {
.settings(sharedSettings ++ Seq(Keys.name := id, mimaPreviousArtifacts := previousVersion(name).toSet))
}
-lazy val algebirdCore = module("core").settings(
- crossScalaVersions += "2.13.8",
- initialCommands := """
+lazy val algebirdCore = module("core")
+ .settings(
+ crossScalaVersions += "2.13.8",
+ // crossScalaVersions += "3.2.2",
+ initialCommands := """
import com.twitter.algebird._
""".stripMargin('|'),
- libraryDependencies ++=
- Seq(
- "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion,
- "org.typelevel" %% "algebra" % algebraVersion,
- "org.scala-lang" % "scala-reflect" % scalaVersion.value,
- "org.scalatest" %% "scalatest" % scalaTestVersion % "test",
- "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat
- ) ++ {
- if (isScala213x(scalaVersion.value)) {
- Seq()
- } else {
- Seq(compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)))
- }
- },
- addCompilerPlugin(("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full)),
- Compile / sourceGenerators += Def.task {
- GenTupleAggregators.gen((Compile / sourceManaged).value)
- }.taskValue,
- // Scala 2.12's doc task was failing.
- Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))),
- Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a"))
-)
+ libraryDependencies ++=
+ Seq(
+ "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion,
+ ("org.typelevel" %% "algebra" % algebraVersion).cross(CrossVersion.for3Use2_13),
+ "org.scalatest" %% "scalatest" % scalaTestVersion % "test",
+ "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat
+ ) ++ {
+ if (isScala3(scalaVersion.value)) {
+ Seq.empty
+ } else if (isScala213x(scalaVersion.value)) {
+ Seq(
+ "org.scala-lang" % "scala-reflect" % scalaVersion.value,
+ compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full)
+ )
+ } else {
+ Seq(
+ "org.scala-lang" % "scala-reflect" % scalaVersion.value,
+ compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)),
+ compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full)
+ )
+ }
+ },
+ Compile / sourceGenerators += Def.task {
+ GenTupleAggregators.gen((Compile / sourceManaged).value)
+ }.taskValue,
+ // Scala 2.12's doc task was failing.
+ Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))),
+ Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a"))
+ )
+ .settings(kindprojectorSettings)
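
A note on `.cross(CrossVersion.for3Use2_13)` above: until a dependency publishes Scala 3 artifacts, sbt can be told to resolve its `_2.13` build when compiling for Scala 3, relying on the binary compatibility between 2.13 and 3 output. The general pattern, with hypothetical coordinates:

    // resolves foo_2.13 even when scalaVersion is 3.x
    libraryDependencies += ("org.example" %% "foo" % "1.0.0").cross(CrossVersion.for3Use2_13)
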
lazy val algebirdTest = module("test")
.settings(