diff --git a/.gitignore b/.gitignore index 836905ce4..3913864c6 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ sonatype.sbt BUILD target/ lib_managed/ +project/metals.sbt project/boot/ project/build/target/ project/plugins/target/ diff --git a/.scalafmt.conf b/.scalafmt.conf index c9f903c4f..d4daaafab 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,7 +1,10 @@ version=3.6.0 runner.dialect = scala212 fileOverride { - "glob:**/scala-2.13*/**" { + "glob:**/scala-3/**" { + runner.dialect = scala3 + } + "glob:**/scala-2*/**" { runner.dialect = scala213 } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala b/algebird-core/src/main/scala-2.11/Aggregator.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala rename to algebird-core/src/main/scala-2.11/Aggregator.scala index 4e78d234b..fd380a15d 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala +++ b/algebird-core/src/main/scala-2.11/Aggregator.scala @@ -20,7 +20,7 @@ object Aggregator extends java.io.Serializable { * This is a trivial aggregator that always returns a single value */ def const[T](t: T): MonoidAggregator[Any, Unit, T] = - prepareMonoid { _: Any => () }.andThenPresent(_ => t) + prepareMonoid((_: Any) => ()).andThenPresent(_ => t) /** * Using Aggregator.prepare,present you can add to this aggregator @@ -172,7 +172,7 @@ object Aggregator extends java.io.Serializable { * How many items satisfy a predicate */ def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = - prepareMonoid { t: T => if (pred(t)) 1L else 0L } + prepareMonoid((t: T) => if (pred(t)) 1L else 0L) /** * Do any items satisfy some predicate @@ -310,7 +310,7 @@ object Aggregator extends java.io.Serializable { * Put everything in a Set. Note, this could fill the memory if the Set is very large. */ def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = - prepareMonoid { t: T => Set(t) } + prepareMonoid((t: T) => Set(t)) /** * This builds an in-memory Set, and then finally gets the size of that set. 
This may not be scalable if the diff --git a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala b/algebird-core/src/main/scala-2.11/CountMinSketch.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala rename to algebird-core/src/main/scala-2.11/CountMinSketch.scala index f000c7fe3..809d8785f 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala +++ b/algebird-core/src/main/scala-2.11/CountMinSketch.scala @@ -185,9 +185,9 @@ class CMSSummation[K](params: CMSParams[K]) { val rit = matrix.iterator while (rit.hasNext) { var col = 0 - val cit = rit.next.iterator + val cit = rit.next().iterator while (cit.hasNext) { - cells(offset + col) += cit.next + cells(offset + col) += cit.next() col += 1 } offset += width @@ -206,7 +206,7 @@ class CMSSummation[K](params: CMSParams[K]) { b += cells(offset + col) col += 1 } - b.result + b.result() } val b = Vector.newBuilder[Vector[Long]] @@ -215,7 +215,7 @@ class CMSSummation[K](params: CMSParams[K]) { b += vectorize(row) row += 1 } - CMSInstance(CMSInstance.CountsTable(b.result), totalCount, params) + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) } } @@ -724,7 +724,7 @@ case class CMSInstance[K]( val it = countsTable.counts.iterator var i = 0 while (it.hasNext) { - val row = it.next + val row = it.next() val count = row(hs(i)(item)) if (count < freq) freq = count i += 1 @@ -817,13 +817,13 @@ object CMSInstance { val yss = other.counts.iterator val rows = Vector.newBuilder[Vector[Long]] while (xss.hasNext) { - val xs = xss.next.iterator - val ys = yss.next.iterator + val xs = xss.next().iterator + val ys = yss.next().iterator val row = Vector.newBuilder[Long] - while (xs.hasNext) row += (xs.next + ys.next) - rows += row.result + while (xs.hasNext) row += (xs.next() + ys.next()) + rows += row.result() } - CountsTable[K](rows.result) + CountsTable[K](rows.result()) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala b/algebird-core/src/main/scala-2.11/DecayedVector.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/DecayedVector.scala rename to algebird-core/src/main/scala-2.11/DecayedVector.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala b/algebird-core/src/main/scala-2.11/DecayingCMS.scala similarity index 98% rename from algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala rename to algebird-core/src/main/scala-2.11/DecayingCMS.scala index 2b6a5f157..fd8433754 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/DecayingCMS.scala +++ b/algebird-core/src/main/scala-2.11/DecayingCMS.scala @@ -210,7 +210,7 @@ final class DecayingCMS[K]( val hashFns: Array[K => Int] = { val rng = new Random(seed) def genPos(): Int = - rng.nextInt match { + rng.nextInt() match { case 0 => genPos() case n => n & 0x7fffffff } @@ -323,10 +323,10 @@ final class DecayingCMS[K]( var i = 0 while (i < cells.length) { val it = cells(i).iterator - var localMax = it.next // we know it doesn't start empty + var localMax = it.next() // we know it doesn't start empty if (localMax < minMinimum) minMinimum = localMax while (it.hasNext) { - val n = it.next + val n = it.next() if (n > localMax) localMax = n else if (n < minMinimum) minMinimum = n } @@ -362,7 +362,7 @@ final class DecayingCMS[K]( val it0 = this.cells(i).iterator val it1 = that.cells(i).iterator while (it0.hasNext) { - val x = it0.next * it1.next + val x = it0.next() * 
it1.next() if (x != 0.0) sum += x } if (sum < res) res = sum @@ -426,7 +426,7 @@ final class DecayingCMS[K]( val x = this val y = other val timeInHL = Math.max(x.timeInHL, y.timeInHL) - val cms = new CMS(allocCells, 0.0, timeInHL) + val cms = new CMS(allocCells(), 0.0, timeInHL) val xscale = x.getScale(timeInHL) val yscale = y.getScale(timeInHL) @@ -445,7 +445,7 @@ final class DecayingCMS[K]( bldr += prod(left(j), xscale) + prod(right(j), yscale) j += 1 } - cms.cells(i) = bldr.result + cms.cells(i) = bldr.result() i += 1 } cms @@ -505,7 +505,7 @@ final class DecayingCMS[K]( if (expL == 0.0) { new CMS(monoid.zero.cells, 0.0, ts) } else { - val cms = new CMS(allocCells, 0.0, ts) + val cms = new CMS(allocCells(), 0.0, ts) var i = 0 while (i < depth) { val ci = cells(i) @@ -547,7 +547,7 @@ final class DecayingCMS[K]( bldr += scratch(j) j += 1 } - cells(i) = bldr.result + cells(i) = bldr.result() i += 1 } cells @@ -606,7 +606,7 @@ final class DecayingCMS[K]( val arr = new Array[CMS](ChunkSize) while (it.hasNext) { while (it.hasNext && i < ChunkSize) { - arr(i) = it.next + arr(i) = it.next() i += 1 } if (i > 1) { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala b/algebird-core/src/main/scala-2.11/Fold.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Fold.scala rename to algebird-core/src/main/scala-2.11/Fold.scala index c2f21d145..ded32e628 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Fold.scala +++ b/algebird-core/src/main/scala-2.11/Fold.scala @@ -66,8 +66,8 @@ sealed trait Fold[-I, +O] extends Serializable { val self = this new Fold[I, P] { type X = self.X - override def build: FoldState[X, I, P] = - self.build.map(f) + override def build(): FoldState[X, I, P] = + self.build().map(f) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Interval.scala b/algebird-core/src/main/scala-2.11/Interval.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/Interval.scala rename to algebird-core/src/main/scala-2.11/Interval.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.11/InvariantAlgebras.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/InvariantAlgebras.scala rename to algebird-core/src/main/scala-2.11/InvariantAlgebras.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala b/algebird-core/src/main/scala-2.11/JavaMonoids.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/JavaMonoids.scala rename to algebird-core/src/main/scala-2.11/JavaMonoids.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala b/algebird-core/src/main/scala-2.11/MapAlgebra.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala rename to algebird-core/src/main/scala-2.11/MapAlgebra.scala index 8ee81c42d..55a9f8e54 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MapAlgebra.scala +++ b/algebird-core/src/main/scala-2.11/MapAlgebra.scala @@ -224,7 +224,7 @@ object MapAlgebra { } else oldVOpt.get bldr += v } - mutable.iterator.map { case (k, bldr) => (k, bldr.result) }.toMap + mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap } // Consider this as edges from k -> v, produce a Map[K,Set[V]] diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala 
b/algebird-core/src/main/scala-2.11/Scan.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/Scan.scala rename to algebird-core/src/main/scala-2.11/Scan.scala index ff0dce400..d1d10ced7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Scan.scala +++ b/algebird-core/src/main/scala-2.11/Scan.scala @@ -169,9 +169,9 @@ sealed abstract class Scan[-I, +O] extends Serializable { def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] { override def hasNext: Boolean = iter.hasNext var state: State = initialState - override def next: O = { + override def next(): O = { val thisState = state - val thisA = iter.next + val thisA = iter.next() val (thisC, nextState) = presentAndNextState(thisA, thisState) state = nextState thisC diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala b/algebird-core/src/main/scala-2.11/SpaceSaver.scala similarity index 99% rename from algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala rename to algebird-core/src/main/scala-2.11/SpaceSaver.scala index 68830547e..d18b58dd6 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SpaceSaver.scala +++ b/algebird-core/src/main/scala-2.11/SpaceSaver.scala @@ -78,7 +78,7 @@ object SpaceSaver { buff.putLong(b) buffer ++= buff.array() } - buffer.result.toArray + buffer.result().toArray } // Make sure to be reversible so fromBytes(toBytes(x)) == x diff --git a/algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala b/algebird-core/src/main/scala-2.11/VectorSpace.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/VectorSpace.scala rename to algebird-core/src/main/scala-2.11/VectorSpace.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.11/monad/EitherMonad.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/EitherMonad.scala rename to algebird-core/src/main/scala-2.11/monad/EitherMonad.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala b/algebird-core/src/main/scala-2.11/monad/Reader.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/Reader.scala rename to algebird-core/src/main/scala-2.11/monad/Reader.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala b/algebird-core/src/main/scala-2.11/monad/StateWithError.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/monad/StateWithError.scala rename to algebird-core/src/main/scala-2.11/monad/StateWithError.scala diff --git a/algebird-core/src/main/scala-2.12/Aggregator.scala b/algebird-core/src/main/scala-2.12/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. 
+ * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. 
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator. 
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count). 
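+ *
+ * A hypothetical usage sketch (the values are illustrative, not part of the original source):
+ * {{{
+ * // keep the 2 longest strings, longest first
+ * val agg = Aggregator.sortByReverseTake[String, Int](2)(_.length)
+ * // agg(List("a", "ccc", "bb")) == Seq("ccc", "bb")
+ * }}}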
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100) + + /** + * Returns the lower bound of a given percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative.
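+ *
+ * A hypothetical sketch (illustrative only): {{{ val medianLowerBound = Aggregator.approximatePercentile[Long](0.5) }}}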
+ */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * > import com.twitter.algebird.Ring.numericRing > def numericAggregator[T: Numeric]: MonoidAggregator[T, + * T, T] = Aggregator.fromMonoid[T] + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's fold package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get + + /** + * This is the safe version of the above.
If the input is empty, return None, else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for Monoid Aggregators it never will, instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that)) + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators.
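+ *
+ * A hypothetical sketch (the aggregators and data are illustrative, not from the original source):
+ * {{{
+ * val maxAndMin = Aggregator.max[Int].zip(Aggregator.min[Int])
+ * // maxAndMin(List((1, 10), (3, 7))) == (3, 7)
+ * }}}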
+ * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-2.12/CountMinSketch.scala b/algebird-core/src/main/scala-2.12/CountMinSketch.scala new file mode 100644 index 000000000..826aebd5a --- /dev/null +++ b/algebird-core/src/main/scala-2.12/CountMinSketch.scala @@ -0,0 +1,1420 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */ +/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have less than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
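+ *
+ * A usage sketch, mirroring how `CMSMonoid.create(data)` uses this builder elsewhere in this file:
+ * {{{
+ * val summation = new CMSSummation(params)
+ * data.foreach(k => summation.insert(k, 1L))
+ * val cms = summation.result
+ * }}}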
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`.
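+ *
+ * A worked example (derived from the `exp(-depth)` formula below): `delta(10) == math.exp(-10)`, roughly `4.54e-5`.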
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +} + +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`.
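+ *
+ * A worked example (using `depth = ceil(ln(1 / delta))` from [[CMSFunctions]]): `delta = 1e-10` gives
+ * `depth = ceil(ln(1e10)) = 24`.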
+ */ + def depth: Int = CMSFunctions.depth(delta) + + /** + * Number of counters per hash function (also: number of columns in the counting table). This number is + * derived from `eps`. + */ + def width: Int = CMSFunctions.width(eps) + + /** + * An Option parameter about how many exact counts a sparse CMS wants to keep + */ + def maxExactCountOpt: Option[Int] + + /** + * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`. + */ + def maxExactCount: Int = + CMSFunctions.maxExactCount(maxExactCountOpt, depth, width) + + /** + * Returns a new sketch that is the combination of this sketch and the other sketch. + */ + def ++(other: C[K]): C[K] + + /** + * Counts the item and returns the result as a new sketch. + */ + def +(item: K): C[K] = this + (item, 1L) + + /** + * Counts the item `count` times and returns the result as a new sketch. + */ + def +(item: K, count: Long): C[K] + + /** + * Returns an estimate of the total number of times this item has been seen in the stream so far. This + * estimate is an upper bound. + * + * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also + * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`. + */ + def frequency(item: K): Approximate[Long] + + /** + * Returns an estimate of the inner product against another data stream. + * + * In other words, let a_i denote the number of times element i has been seen in the data stream summarized + * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of `<a, b> = + * \sum a_i b_i`. + * + * Note: This can also be viewed as the join size between two relations. + * + * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it + * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`. + */ + def innerProduct(other: C[K]): Approximate[Long] + + /** + * Total number of elements counted (i.e. seen in the data stream) so far. + */ + def totalCount: Long + + /** + * The first frequency moment is the total number of elements in the stream. + */ + def f1: Long = totalCount + + /** + * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element. + */ + def f2: Approximate[Long] + +} + +/** + * A trait for CMS implementations that can track heavy hitters in a data stream. + * + * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one + * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N" + * heavy hitters. + * + * Known implementations: [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + */ +trait CMSHeavyHitters[K] { + + /** + * The pluggable logic of how heavy hitters are being tracked. + */ + def heavyHittersLogic: HeavyHittersLogic[K] + + /** + * Returns the set of heavy hitters.
+ */ + def heavyHitters: Set[K] + +} + +object CMS { + + def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] = + monoid(eps, delta, seed, None) + def monoid[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSMonoid[K] = + new CMSMonoid[K](eps, delta, seed, maxExactCountOpt) + + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] = + monoid(depth, width, seed, None) + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] = + aggregator(eps, delta, seed, None) + def aggregator[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt)) + + def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] = + aggregator(depth, width, seed, None) + def aggregator[K: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + /** + * Returns a fresh, zeroed CMS instance. + */ + def apply[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] = None + ): CMS[K] = { + val params = { + val hashes: Seq[CMSHash[K]] = + CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + CMSZero[K](params) + } + +} + +/** + * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data + * stream. + * + * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]]. + * + * =Usage= + * + * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`. + * + * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is, + * the counting table behind the scenes is backed by `Long` values (at least in the current implementation), + * and thus the returned frequency estimates are always instances of `Approximate[Long]`. + * + * @example + * {{{ + * + * // Creates a monoid for a CMS that can count `Long` elements. val cmsMonoid: CMSMonoid[Long] = { val eps = + * 0.001 val delta = 1E-10 val seed = 1 CMS.monoid[Long](eps, delta, seed) } + * + * // Creates a CMS instance that has counted the element `1L`. val cms: CMS[Long] = cmsMonoid.create(1L) + * + * // Estimates the frequency of `1L` val estimate: Approximate[Long] = cms.frequency(1L) + * }}} + * + * @tparam K + * The type used to identify the elements to be counted. + */ +sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] { + + override val eps: Double = params.eps + + override val delta: Double = params.delta + + override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt + + override def f2: Approximate[Long] = innerProduct(this) + +} + +/** + * Zero element. Used for initialization. 
+ */ +case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) { + + override val totalCount: Long = 0L + + override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params) + + override def ++(other: CMS[K]): CMS[K] = other + + override def frequency(item: K): Approximate[Long] = Approximate.exact(0L) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + Approximate.exact(0L) + +} + +/** + * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables. + */ +case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K]) + extends CMS[K](params) { + + override def +(x: K, count: Long): CMS[K] = + SparseCMS[K](params) + (item, totalCount) + (x, count) + + override def ++(other: CMS[K]): CMS[K] = + other match { + case _: CMSZero[?] => this + case other: CMSItem[K] => + CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount) + case _ => other + item + } + + override def frequency(x: K): Approximate[Long] = + if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + Approximate.exact(totalCount) * other.frequency(item) + +} + +/** + * A sparse Count-Min sketch structure, used for situations where the key is highly skewed. + */ +case class SparseCMS[K]( + exactCountTable: Map[K, Long], + override val totalCount: Long, + override val params: CMSParams[K] +) extends CMS[K](params) { + import SparseCMS._ + + override def +(x: K, count: Long): CMS[K] = { + val currentCount = exactCountTable.getOrElse(x, 0L) + val newTable = exactCountTable.updated(x, currentCount + count) + if (newTable.size < maxExactCount) { + // still sparse + SparseCMS(newTable, totalCount = totalCount + count, params = params) + } else { + toDense(newTable, params) + } + } + + override def ++(other: CMS[K]): CMS[K] = + other match { + case _: CMSZero[?] => this + case other: CMSItem[K] => this + (other.item, other.totalCount) + case other: SparseCMS[K] => + // This SparseCMS's maxExactCount is used, so ++ is not commutative + val newTable = Semigroup.plus(exactCountTable, other.exactCountTable) + if (newTable.size < maxExactCount) { + // still sparse + SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params) + } else { + toDense(newTable, params) + } + + case other: CMSInstance[K] => other ++ this + } + + override def frequency(x: K): Approximate[Long] = + Approximate.exact(exactCountTable.getOrElse(x, 0L)) + + override def innerProduct(other: CMS[K]): Approximate[Long] = + exactCountTable.iterator + .map { case (x, count) => Approximate.exact(count) * other.frequency(x) } + .reduceOption(_ + _) + .getOrElse(Approximate.exact(0L)) +} + +object SparseCMS { + + /** + * Creates a new [[SparseCMS]] with empty exactCountTable + */ + def apply[K](params: CMSParams[K]): SparseCMS[K] = { + val exactCountTable = Map[K, Long]() + SparseCMS[K](exactCountTable, totalCount = 0, params = params) + } + + /** + * Creates a new [[CMSInstance]] from a Map[K, Long] + */ + def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] = + // Create new CMSInstance + exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) => + cms + (x, count) + } +} + +/** + * The general Count-Min sketch structure, used for holding any number of elements.
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+   * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+   * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
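+     *
+     * For illustration (a sketch): starting from a zeroed 2x2 table, adding at position (0, 1) leaves
+     * every other cell untouched:
+     * {{{
+     * val t = CMSInstance.CountsTable[Long](2, 2) // Vector(Vector(0, 0), Vector(0, 0))
+     * t + ((0, 1), 5L)                            // Vector(Vector(0, 5), Vector(0, 0))
+     * }}}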
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
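+   *
+   * Equivalent to folding `create` over the items with `plus`, e.g. (an illustrative sketch reusing
+   * `topPctCMSMonoid` from the [[TopCMS]] class-level example):
+   * {{{
+   * val sketch: TopCMS[Long] = topPctCMSMonoid.create(Seq(1L, 2L, 2L))
+   * }}}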
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
+ *   Note that this mapping between the elements of your problem domain and their identifiers used for
+ *   counting via CMS should be bijective. We require a [[CMSHasher]] context bound for `K`, see [[CMSHasher]]
+ *   for available implicits that can be imported. Which type K should you pick in practice? For domains that
+ *   have less than `2^64` unique elements, you'd typically use `Long`. For larger domains you can try
+ *   `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * being merged, the more likely the end result is biased towards these heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
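+ *
+ * For example (an illustrative sketch relying on the shipped `CMSHasher[Long]`; parameter values are
+ * arbitrary):
+ * {{{
+ * val monoid = TopNCMS.monoid[Long](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 10)
+ * val cms = monoid.create(Seq(1L, 1L, 2L)) // safe: built from a single stream, no `++` merging
+ * }}}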
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
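+ *
+ * For example (an illustrative sketch; parameter values are arbitrary): with K1 = country, K2 = city and
+ * heavyHittersN = 2, the two most frequent cities are kept per country, rather than the two most frequent
+ * (country, city) pairs overall:
+ * {{{
+ * val monoid = ScopedTopNCMS.monoid[String, String](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 2)
+ * val cms = monoid.create(Seq(("us", "nyc"), ("us", "sf"), ("fr", "paris")))
+ * }}}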
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-2.12/DecayedVector.scala b/algebird-core/src/main/scala-2.12/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-2.12/DecayingCMS.scala b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
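+     *
+     * As a quick worked example of the formula below (illustrative numbers): a value of 8.0 at
+     * timeInHL = 2.0 decays to 1.0 three half-lives later, so timeToUnit = log(8) / log(2) + 2.0 = 5.0.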
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        // 1 = value * module.getScale(0.0, timeInHL, result)
+        // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        // log(value) = (result - timeInHL) * log2
+        // log(value) / log2 = result - timeInHL
+        // log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   *   delta = (t1 - t0) / halflife
+   *   y1 = y0 * 2^(-delta) + n
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   *   (1) zN = yN * scaleN
+   *
+   * Our constraint is expressed as:
+   *
+   *   (2) If n=0, z1 = z0
+   *
+   * In that case:
+   *
+   *   (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   *   (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   *   (5) 2^(-delta) * scale1 = scale0
+   *   (6) scale1 = scale0 * 2^(delta)
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   *   (7) z1 = y1 * scale1
+   *   (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   *   (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   *   (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   *   (11) z1 / scale1 = z0 / scale1 + n
+   *   (12) z1 = z0 + n * scale1
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   *   (13) logscale1 = logscale0 + delta * log(2)
+   *   (14) z1 = z0 + n * exp(logscale1)
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   *   (15) y1 = z1 / scale1
+   *   (16) y1 = z1 / exp(logscale1)
+   *   (17) y1 = z1 * exp(-logscale1)
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
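+     *
+     * Illustrative use (a sketch):
+     * {{{
+     * val (floor, cap) = cms.range
+     * // every key's reported value is >= floor; no key's value is reported above cap
+     * }}}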
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double = + if (x == 0.0) 0.0 else x * y + + var i = 0 + while (i < depth) { + val left = x.cells(i) + val right = y.cells(i) + var j = 0 + val bldr = rowBuilder() + while (j < width) { + bldr += prod(left(j), xscale) + prod(right(j), yscale) + j += 1 + } + cms.cells(i) = bldr.result() + i += 1 + } + cms + } + + def add(t: Long, k: K, n: Double): CMS = + scaledAdd(fromTimestamp(t), k, n) + + // TODO: we could allocate a mutable scratch pad, write all the + // values into it, and then build a CMS out of it. if items is + // very small, this would be less efficient than what we're doing + // now. probably the "ideal" solution would be determine how many + // items there are. if we have fewer than ~width items, this + // approach is fine. for more, a scratch pad would be better + // (assuming we wrote that code). + // + // alternately, you could map items into (zero + item) and then + // use the monoid's sum to boil it down. + // + // we only use this in testing currently so the current code is + // fine until we rely on it in production. any change here should + // probably include benchmarks justifying the design. + def bulkAdd(items: Iterable[(Long, K, Double)]): CMS = + items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) } + + private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS = + if (n < 0.0) { + val t = toTimestamp(ts1) + throw new IllegalArgumentException( + s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t" + ) + } else if (n == 0.0) { + this + } else { + val logScale1 = nextLogScale(ts1) + if (logScale1 > maxLogScale) { + rescaleTo(ts1).scaledAdd(ts1, k, n) + } else { + val increment = n * Math.exp(logScale1) + val cells1 = allocCells() + var didx = 0 + while (didx < depth) { + val cell = cells(didx) + val w = hashFns(didx)(k) + cells1(didx) = cell.updated(w, cell(w) + increment) + didx += 1 + } + new CMS(cells1, logScale1, ts1) + } + } + + // Set the scale back to 0.0 + // input time is in half-lives + private[algebird] def rescaleTo(ts: Double): CMS = { + val logScale1 = nextLogScale(ts) + val expL = Math.exp(-logScale1) + if (expL == 0.0) { + new CMS(monoid.zero.cells, 0.0, ts) + } else { + val cms = new CMS(allocCells(), 0.0, ts) + var i = 0 + while (i < depth) { + val ci = cells(i) + cms.cells(i) = ci.map(_ * expL) + i += 1 + } + cms + } + } + } + + private def rowBuilder() = { + val bldr = Vector.newBuilder[Double] + bldr.sizeHint(width) + bldr + } + + object CMS { + + implicit val monoidForCMS: Monoid[CMS] = + new Monoid[CMS] { + + def zero: CMS = module.empty + + def plus(x: CMS, y: CMS): CMS = + x + y + + /** + * Turn a flat array into an array of vectors. + */ + private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = { + val cells = new Array[Vector[Double]](depth) + var i = 0 + while (i < depth) { + var j = i * width + val limit = j + width + val bldr = rowBuilder() + while (j < limit) { + bldr += scratch(j) + j += 1 + } + cells(i) = bldr.result() + i += 1 + } + cells + } + + /** + * This method sums the first `num` items in `arr`. 
+         */
+        private def innerSum(arr: Array[CMS], num: Int): CMS =
+          if (num == 0) zero
+          else if (num == 1) arr(0)
+          else if (num == 2) plus(arr(0), arr(1))
+          else {
+            // start with zero
+            val scratch: Array[Double] = new Array(totalCells)
+
+            val latestTimeInHL: Double =
+              arr.iterator.take(num).map(cms => cms.timeInHL).max
+
+            var i = 0
+            while (i < num) {
+              val cms = arr(i)
+              val scale = cms.getScale(latestTimeInHL)
+              var j = 0
+              while (j < depth) {
+                val row = cms.cells(j)
+                val stride = j * width
+                var k = 0
+                while (k < width) {
+                  val n = row(k)
+                  if (n > 0.0) {
+                    scratch(stride + k) += scale * n
+                  }
+                  k += 1
+                }
+                j += 1
+              }
+              i += 1
+            }
+
+            val cells = scratchToCells(scratch)
+
+            new CMS(cells, 0.0, latestTimeInHL)
+          }
+
+        override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = {
+
+          val it: Iterator[CMS] = xs.toIterator
+          val ChunkSize = 1000
+
+          // the idea here is that we read up to 1000 CMS values into
+          // a fixed array, crunch them down to a single CMS, store it
+          // in the first array index, read up to 999 more CMS values
+          // in, crunch them down, and so on.
+          var i = 0
+          val arr = new Array[CMS](ChunkSize)
+          while (it.hasNext) {
+            while (it.hasNext && i < ChunkSize) {
+              arr(i) = it.next()
+              i += 1
+            }
+            if (i > 1) {
+              arr(0) = innerSum(arr, i)
+            }
+            i = 1
+          }
+          if (i == 0) None else Some(arr(0))
+        }
+      }
+  }
+
+  val monoid: Monoid[CMS] = CMS.monoidForCMS
+}
+
+object DecayingCMS {
+
+  /**
+   * Construct a DecayingCMS module.
+   *
+   * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will
+   * always produce the same hash family.
+   *
+   * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by
+   * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to
+   * zero.
+   *
+   * The size of the CMS in bytes is O(depth * width).
+   *
+   * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use
+   * width=100, for 0.1% error, use width=1000, etc.
+   *
+   * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha *
+   * depth), so a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this
+   * as small as possible.
+   */
+  def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit
+      hasher: CMSHasher[K]
+  ): DecayingCMS[K] =
+    new DecayingCMS(seed, halfLife, depth, width, hasher)
+}
diff --git a/algebird-core/src/main/scala-2.12/Fold.scala b/algebird-core/src/main/scala-2.12/Fold.scala
new file mode 100644
index 000000000..0b89f2d62
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Fold.scala
@@ -0,0 +1,352 @@
+/*
+Copyright 2014 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import java.io.Serializable
+import scala.collection.compat._
+
+/**
+ * Folds are first-class representations of "Traversable.foldLeft."
+ * They have the nice property that they can be fused to work in parallel over an input sequence.
+ *
+ * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when
+ * done. We use existential types to hide internal details and to allow for internal and external (X and O)
+ * types to differ for "map" and "join."
+ *
+ * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a
+ * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the
+ * fold.
+ *
+ * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like
+ * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also
+ * expose some internal state so library authors can fold over their own types.
+ *
+ * See the companion object for constructors.
+ */
+sealed trait Fold[-I, +O] extends Serializable {
+
+  /**
+   * Users can ignore this type.
+   *
+   * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good
+   * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it
+   * provides.
+   */
+  type X
+
+  /**
+   * Users can ignore this method. It is exposed so library authors can run folds over their own sequence
+   * types.
+   *
+   * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the
+   * same Fold many times over different data structures, but we must build a new FoldState every time.
+   *
+   * See FoldState for information on how to use this for your own sequence types.
+   */
+  def build(): FoldState[X, I, O]
+
+  /**
+   * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or
+   * "Function1.andThen."
+   */
+  def map[P](f: O => P): Fold[I, P] = {
+    val self = this
+    new Fold[I, P] {
+      type X = self.X
+      override def build(): FoldState[X, I, P] =
+        self.build().map(f)
+    }
+  }
+
+  /**
+   * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time
+   * and combines at the end.
+   */
+  def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = {
+    val self = this
+    new Fold[I2, Q] {
+      type X = (self.X, other.X)
+      override def build(): FoldState[X, I2, Q] = {
+        val first = self.build()
+        val second = other.build()
+        new FoldState(
+          { case ((x, y), i) => (first.add(x, i), second.add(y, i)) },
+          (first.start, second.start),
+          { case (x, y) => f(first.end(x), second.end(y)) }
+        )
+      }
+    }
+  }
+
+  /**
+   * Convenient shorthand for joining Folds without combining at the end.
+   */
+  def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] =
+    joinWith(other) { case (o, p) => (o, p) }
+
+  /**
+   * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.")
+   * This is analogous to "Function1.compose."
+   */
+  def contramap[H](f: H => I): Fold[H, O] = {
+    val self = this
+    new Fold[H, O] {
+      type X = self.X
+      override def build(): FoldState[X, H, O] =
+        self.build().contramap(f)
+    }
+  }
+
+  /**
+   * Trivially runs a Fold over an empty sequence.
+   */
+  def overEmpty: O = {
+    // build is a "def" so we construct the state once and use the pieces to run the fold
+    val state = build()
+    state.end(state.start)
+  }
+
+  /**
+   * Trivially runs a Fold over a single element sequence.
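+   *
+   * For example (a sketch): `Fold.size.overSingleton("a")` yields `1L`.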
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
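+   *
+   * For example (an illustrative sketch):
+   * {{{
+   * val both = Fold.sequence(Seq(Fold.size, Fold.count[Int](_ > 1)))
+   * both.overTraversable(List(1, 2, 3)) // Seq(3L, 2L)
+   * }}}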
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const:
+   * def const[A, B](b: B): (A => B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
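+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Fold.exists[Int](_ > 2).overTraversable(List(1, 2, 3)) // true
+   * }}}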
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-2.12/Interval.scala b/algebird-core/src/main/scala-2.12/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
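+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Interval.leftClosedRightOpen(0, 10) // a MaybeEmpty[Int, Interval.InLowExUp], not a bare Interval[Int]
+   * }}}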
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change.
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that intr.contains(s)
+     *
+     * if this returns None, it may be Empty, Upper, or Universe
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s)
+     *
+     * if this returns None, it may be Empty, Lower, or Universe
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0, 1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
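+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * ExclusiveLower(0).intersects(ExclusiveUpper(1)) // true, yet no Int lies in (0, 1)
+   * }}}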
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option because of cases like
+   * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p: 
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
 Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot be
+   * normalized to [a, b) form, thus we return an Option. Also, there are cases, like [Int.MinValue,
+   * Int.MaxValue], that cannot be normalized but are actually equivalent to Universe. The bottom line: if
+   * this returns None, it just means you can't express the interval this way; it does not mean it is empty or
+   * universe, etc. (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?] 
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-2.12/JavaMonoids.scala b/algebird-core/src/main/scala-2.12/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-2.12/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */
+package com.twitter.algebird
+
+import java.lang.{
+  Boolean => JBool,
+  Double => JDouble,
+  Float => JFloat,
+  Integer => JInt,
+  Long => JLong,
+  Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+  override val zero: JInt = JInt.valueOf(0)
+  override val one: JInt = JInt.valueOf(1)
+  override def plus(x: JInt, y: JInt): JInt = x + y
+  override def negate(x: JInt): JInt = -x
+  override def minus(x: JInt, y: JInt): JInt = x - y
+  override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+  override val zero: JShort = Short.box(0)
+  override val one: JShort = Short.box(1)
+  override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+  override def negate(x: JShort): JShort = (-x).toShort
+  override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+  override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+  override val zero: JLong = JLong.valueOf(0L)
+  override val one: JLong = JLong.valueOf(1L)
+  override def plus(x: JLong, y: JLong): JLong = x + y
+  override def negate(x: JLong): JLong = -x
+  override def minus(x: JLong, y: JLong): JLong = x - y
+  override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+  override val zero: JFloat = JFloat.valueOf(0.0f)
+  override val one: JFloat = JFloat.valueOf(1.0f)
+  override def plus(x: JFloat, y: JFloat): JFloat = x + y
+  override def negate(x: JFloat): JFloat = -x
+  override def minus(x: JFloat, y: JFloat): JFloat = x - y
+  override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+  override val zero: JDouble = JDouble.valueOf(0.0)
+  override val one: JDouble = JDouble.valueOf(1.0)
+  override def plus(x: JDouble, y: JDouble): JDouble = x + y
+  override def negate(x: JDouble): JDouble = -x
+  override def minus(x: JDouble, y: JDouble): JDouble = x - y
+  override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+  override val zero: JBool = JBool.FALSE
+  override val one: JBool = JBool.TRUE
+  override def plus(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue ^ y.booleanValue)
+  override def negate(x: JBool): JBool = x
+  override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+  override def times(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since Java Lists are mutable, this always makes a full copy. Prefer Scala immutable Lists: if you use Scala
+ * immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+  override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+  override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+  override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+    val res = new JArrayList[T](x.size + y.size)
+    res.addAll(x)
+    res.addAll(y)
+    res
+  }
+}
+
+/**
+ * Since Java Maps are mutable, this always makes a full copy. Prefer Scala immutable Maps: if you use Scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+  override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+  val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+    case mon: Monoid[?] => mon.isNonZero(_)
+    case _ => _ => true
+  }
+
+  override def isNonZero(x: JMap[K, V]): Boolean =
+    !x.isEmpty && (implicitly[Semigroup[V]] match {
+      case mon: Monoid[?] =>
+        x.values.asScala.exists(v => mon.isNonZero(v))
+      case _ => true
+    })
+  override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = {
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    val vsemi = implicitly[Semigroup[V]]
+    val result = new JHashMap[K, V](big.size + small.size)
+    result.putAll(big)
+    small.entrySet.asScala.foreach { kv =>
+      val smallK = kv.getKey
+      val smallV = kv.getValue
+      if (big.containsKey(smallK)) {
+        val bigV = big.get(smallK)
+        val newV =
+          if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV)
+        if (nonZero(newV))
+          result.put(smallK, newV)
+        else
+          result.remove(smallK)
+      } else {
+        // No need to explicitly add with zero on V, just put in the small value
+        result.put(smallK, smallV)
+      }
+    }
+    result
+  }
+}
diff --git a/algebird-core/src/main/scala-2.12/MapAlgebra.scala b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
new file mode 100644
index 000000000..9ca370eaf
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/MapAlgebra.scala
@@ -0,0 +1,320 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+package com.twitter.algebird
+
+import com.twitter.algebird.macros.{Cuber, Roller}
+import scala.collection.mutable.{Builder, Map => MMap}
+import scala.collection.{Map => ScMap}
+import algebra.ring.Rng
+import scala.collection.compat._
+
+trait MapOperations[K, V, M <: ScMap[K, V]] {
+  def add(oldMap: M, kv: (K, V)): M
+  def remove(oldMap: M, k: K): M
+  def fromMutable(mut: MMap[K, V]): M
+}
+
+abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V])
+    extends Monoid[M]
+    with MapOperations[K, V, M] {
+
+  val nonZero: (V => Boolean) = semigroup match {
+    case mon: Monoid[?] => mon.isNonZero(_)
+    case _ => _ => true
+  }
+
+  override def isNonZero(x: M): Boolean =
+    !x.isEmpty && (semigroup match {
+      case mon: Monoid[?] =>
+        x.valuesIterator.exists(v => mon.isNonZero(v))
+      case _ => true
+    })
+
+  override def plus(x: M, y: M): M = {
+    // Scala maps can reuse internal structure, so don't copy, just add into the bigger one:
+    // This really saves computation when adding lots of small maps into big ones (common)
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    small match {
+      // Mutable maps create new copies of the underlying data on add so don't use the
+      // handleImmutable method.
+      // Cannot have a None so 'get' is safe here.
+      case _: MMap[?, ?] => sumOption(Seq(big, small)).get
+      case _ => handleImmutable(big, small, bigOnLeft)
+    }
+  }
+
+  private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) =
+    small.foldLeft(big) { (oldMap, kv) =>
+      val newV = big
+        .get(kv._1)
+        .map { bigV =>
+          if (bigOnLeft)
+            semigroup.plus(bigV, kv._2)
+          else
+            semigroup.plus(kv._2, bigV)
+        }
+        .getOrElse(kv._2)
+      if (nonZero(newV))
+        add(oldMap, kv._1 -> newV)
+      else
+        remove(oldMap, kv._1)
+    }
+  override def sumOption(items: TraversableOnce[M]): Option[M] =
+    if (items.iterator.isEmpty) None
+    else {
+      val mutable = MMap[K, V]()
+      items.iterator.foreach { m =>
+        m.foreach { case (k, v) =>
+          val oldVOpt = mutable.get(k)
+          // sorry for the micro optimization here: avoiding a closure
+          val newV =
+            if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v)
+          if (nonZero(newV))
+            mutable.update(k, newV)
+          else
+            mutable.remove(k)
+        }
+      }
+      Some(fromMutable(mutable))
+    }
+}
+
+class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] {
+  override lazy val zero: Map[K, V] = Map[K, V]()
+  override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv
+  override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k
+  override def fromMutable(mut: MMap[K, V]): Map[K, V] =
+    new MutableBackedMap(mut)
+}
+
+class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] {
+  override lazy val zero: ScMap[K, V] = ScMap[K, V]()
+  override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv
+  override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k
+  override def fromMutable(mut: MMap[K, V]): ScMap[K, V] =
+    new MutableBackedMap(mut)
+}
+
+/**
+ * You can think of this as a sparse vector group
+ */
+class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] {
+  override def negate(kv: Map[K, V]): Map[K, V] =
+    kv.iterator.map { case (k, v) =>
+      (k, group.negate(v))
+    }.toMap
+}
+
+class ScMapGroup[K, V](implicit val group: Group[V])
+    extends ScMapMonoid[K, V]()(group)
+    with Group[ScMap[K, V]] {
+  override def negate(kv: ScMap[K, V]): ScMap[K, V] =
+    kv.iterator.map { case (k, v) =>
+      (k, group.negate(v))
+    }.toMap
+}
+
+/**
+ * You can think of this as a sparse vector ring
+ */
+trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] {
+
+  implicit def ring: Ring[V]
+
+  override def times(x: M, y: M): M = {
+    val (big, small, bigOnLeft) =
+      if (x.size > y.size) {
+        (x, y, true)
+      } else {
+        (y, x, false)
+      }
+    small.foldLeft(zero) { (oldMap, kv) =>
+      val bigV = big.getOrElse(kv._1, ring.zero)
+      val newV =
+        if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV)
+      if (ring.isNonZero(newV)) {
+        add(oldMap, kv._1 -> newV)
+      } else {
+        remove(oldMap, kv._1)
+      }
+    }
+  }
+}
+
+class MapRing[K, V](implicit override val ring: Ring[V])
+    extends MapGroup[K, V]()(ring)
+    with GenericMapRing[K, V, Map[K, V]]
+
+class ScMapRing[K, V](implicit override val ring: Ring[V])
+    extends ScMapGroup[K, V]()(ring)
+    with GenericMapRing[K, V, ScMap[K, V]]
+
+object MapAlgebra {
+  def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean =
+    l.forall { case (k, v) =>
+      r.get(k).exists(Equiv[V].equiv(_, v))
+    }
+
+  implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] =
+    Equiv.fromFunction { (m1, m2) =>
+      val cleanM1 = removeZeros(m1)
+      val cleanM2 = removeZeros(m2)
+      rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, cleanM1)
+    }
+
+  def mergeLookup[T, U, V: Monoid](
+      keys: TraversableOnce[T]
+  )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+    sumByKey {
+      keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+    }
+
+  // Returns a new map with zero-value entries removed
+  def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+    m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+  /**
+   * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+   * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+   * equivalent to:
+   *
+   * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+   *
+   * Otherwise, the function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+   */
+  def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+    Monoid.sum(pairs.iterator.map(Map(_)))
+
+  /**
+   * For each key, creates a list of all values. This function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2))
+   */
+  def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+    if (pairs.iterator.isEmpty) Map.empty
+    else {
+      val mutable = MMap[K, Builder[V, List[V]]]()
+      pairs.iterator.foreach { case (k, v) =>
+        val oldVOpt = mutable.get(k)
+        // sorry for the micro optimization here: avoiding a closure
+        val bldr = if (oldVOpt.isEmpty) {
+          val b = List.newBuilder[V]
+          mutable.update(k, b)
+          b
+        } else oldVOpt.get
+        bldr += v
+      }
+      mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+    }
+
+  // Consider this as edges from k -> v, produce a Map[K,Set[V]]
+  def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+    Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+  /** join the keys of two maps (similar to outer-join in a DB) */
+  def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+    Monoid
+      .plus(
+        map1.transform { case (_, v) =>
+          (List(v), List[W]())
+        },
+        map2.transform { case (_, w) =>
+          (List[V](), List(w))
+        }
+      )
+      .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
+
+  /**
+   * Reverses a graph losslessly. The None key is for v's with no sources.
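+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * MapAlgebra.invertExact(Map(Option(1) -> Set("a"))) // Map(Some("a") -> Set(1))
+   * }}}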
+   */
+  def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+    def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+      if (i.isEmpty) Iterable(None)
+      else {
+        i.map(Some(_))
+      }
+
+    Monoid.sum {
+      for {
+        (k, sv) <- m.view.toIterable
+        v <- nonEmptyIter(sv)
+      } yield Map(v -> k.toSet)
+    }
+  }
+
+  /**
+   * Invert the common case of exactly one value for each key
+   */
+  def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+    Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+  def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+    Monoid.sum(mring.times(left, right).values)
+
+  def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+    val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      c(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+  def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit c: Cuber[K]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+  def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+    val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      r(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+  def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit r: Roller[K]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-2.12/Scan.scala b/algebird-core/src/main/scala-2.12/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+  /**
+   * Most consumers of Scan don't care about the State type variable. But for those that do, we make an
+   * effort to expose it in all of our combinators.
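+   * For example, a running-sum scan that exposes its Int state (an illustrative sketch, not part of the
+   * original patch):
+   * {{{
+   * val runningSum: Scan.Aux[Int, Int, Int] = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * }}}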
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+   * stream of outputs. Streams can be thought of as one hidden state that is queryable for a head element,
+   * and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head element and tail stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+   *   aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final summation;
+      // this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+   *   monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair within
+ * the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+   *   result of updating stateBeforeProcessing with the information from i.
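+   *
+   * For example (an illustrative sketch, not part of the original patch), for a running-sum scan:
+   * {{{
+   * val runningSum = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * runningSum.presentAndNextState(3, 5) // (8, 8)
+   * }}}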
+   */
+  def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+  /**
+   * @param iter
+   * @return
+   *   If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) =
+   *   presentAndNextState(a_i, state_i)` and `state_0 = initialState`
+   */
+  def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+    override def hasNext: Boolean = iter.hasNext
+    var state: State = initialState
+    override def next(): O = {
+      val thisState = state
+      val thisA = iter.next()
+      val (thisC, nextState) = presentAndNextState(thisA, thisState)
+      state = nextState
+      thisC
+    }
+  }
+
+  /**
+   * @param inputs
+   * @param bf
+   * @tparam In
+   *   The type of the input collection
+   * @tparam Out
+   *   The type of the output collection
+   * @return
+   *   Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form:
+   *   `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`.
+   */
+  def apply[In <: TraversableOnce[I], Out](
+      inputs: In
+  )(implicit bf: BuildFrom[In, O, Out]): Out =
+    bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+  // combinators
+
+  /**
+   * Return a new scan that is the same as this scan, but with a different `initialState`.
+   * @param newInitialState
+   * @return
+   */
+  def replaceState(newInitialState: => State): Aux[I, State, O] =
+    from(newInitialState)(presentAndNextState(_, _))
+
+  def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+    presentAndNextState(f(i), stateBeforeProcessingI)
+  }
+
+  def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    (g(c), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+   * pollute the `State` by pairing it redundantly with `Unit`.
+   * @tparam I1
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, then this results in a Scan whose `apply` method returns
+   *   `[(o_1, a_1), ..., (o_n, a_n)]` when given the same input.
+   */
+  def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+    ((o, i), stateAfterProcessingI)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan before each input updates the state.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+   */
+  def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((stateBeforeProcessingI, o), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan after each input updates the state.
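+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Scan.from(0)((i: Int, s: Int) => (s + i, s + i)).joinWithPosteriorState(List(1, 2))
+   * // List((1, 1), (3, 3))
+   * }}}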
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose apply method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan whose
+   *   apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]` results in the output `[(o_1, p_1), ...,
+   *   (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) == scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan whose
+   *   apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1,
+   *   ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a scan
+   *   which returns `[p_1, ..., p_n]`.
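+   * For example (an illustrative sketch, not part of the original patch):
+   * {{{
+   * Scan.index.compose(Scan.fromFunction((n: Long) => n * 2))(List("a", "b", "c")) // List(0L, 2L, 4L)
+   * }}}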
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.12/SpaceSaver.scala b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.12/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public API to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public API to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing in order
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the item/counters (counters as length + N * (item size + item + 2 * counters))
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets, as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+    Try {
+      bb.get().toInt match {
+        case 1 =>
+          val capacity = bb.getInt
+          val itemLength = bb.getInt
+          val itemAsBytes = new Array[Byte](itemLength)
+          bb.get(itemAsBytes)
+          tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+        case 2 =>
+          val capacity = bb.getInt
+
+          var countersToDeserialize = bb.getInt
+          val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+          while (countersToDeserialize != 0) {
+            val itemLength = bb.getInt()
+            val itemAsBytes = new Array[Byte](itemLength)
+            bb.get(itemAsBytes)
+            val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+            val a = bb.getLong
+            val b = bb.getLong
+
+            item match {
+              case Failure(e) => return Failure(e)
+              case Success(i) =>
+                counters += ((i, (a, b)))
+            }
+
+            countersToDeserialize -= 1
+          }
+
+          Success(SSMany(capacity, counters.toMap))
+      }
+    }.flatten
+}
+
+/**
+ * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * to preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+  import SpaceSaver.ordering
+
+  /**
+   * Maximum number of counters to keep (parameter "m" in the research paper).
+   */
+  def capacity: Int
+
+  /**
+   * Current lowest value for count
+   */
+  def min: Long
+
+  /**
+   * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+   * (error)
+   */
+  def counters: Map[T, (Long, Long)]
+
+  def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+  /**
+   * Returns the frequency estimate for the item
+   */
+  def frequency(item: T): Approximate[Long] = {
+    val (count, err) = counters.getOrElse(item, (min, min))
+    Approximate(count - err, count, count, 1.0)
+  }
+
+  /**
+   * Get the elements that show up at least thres times. Returns sorted in descending order: (item,
+   * Approximate[Long], guaranteed)
+   */
+  def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+    counters.iterator
+      .filter { case (_, (count, _)) => count >= thres }
+      .toList
+      .sorted(ordering)
+      .map { case (item, (count, err)) =>
+        (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+      }
+
+  /**
+   * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+   */
+  def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+    require(k < capacity)
+    val si = counters.toList
+      .sorted(ordering)
+    val siK = si.take(k)
+    val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+    siK.map { case (item, (count, err)) =>
+      (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+    }
+  }
+
+  /**
+   * Check consistency with other SpaceSaver, useful for testing. 
+ + /** + * Check consistency with other SpaceSaver, useful for testing. Returns a boolean indicating whether they are + * consistent + */ + def consistentWith(that: SpaceSaver[T]): Boolean = + (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0) +} + +case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] { + require(capacity > 1) + + override def min: Long = 0L + + override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L))) + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => SSMany(this).add(other) + case other: SSMany[?] => other.add(this) + } +} + +object SSMany { + private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] = + SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap + + private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] = + SSMany(capacity, counters, bucketsFromCounters(counters)) + + private[algebird] def apply[T](one: SSOne[T]): SSMany[T] = + SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item))) +} + +case class SSMany[T] private ( + override val capacity: Int, + override val counters: Map[T, (Long, Long)], + buckets: SortedMap[Long, Set[T]] +) extends SpaceSaver[T] { + private val exact: Boolean = counters.size < capacity + + override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey + + // item is already present and just needs to be bumped up one + private def bump(item: T) = { + val (count, err) = counters(item) + val counters1 = counters + (item -> ((count + 1L, err))) // increment by one + val currBucket = buckets(count) // current bucket + val buckets1 = { + if (currBucket.size == 1) // delete current bucket since it will be empty + buckets - count + else // remove item from current bucket + buckets + (count -> (currBucket - item)) + } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // lose one item to meet capacity constraint + private def loseOne = { + val firstBucket = buckets(buckets.firstKey) + val itemToLose = firstBucket.head + val counters1 = counters - itemToLose + val buckets1 = + if (firstBucket.size == 1) + buckets - min + else + buckets + (min -> (firstBucket - itemToLose)) + SSMany(capacity, counters1, buckets1) + } + + // introduce new item + private def introduce(item: T, count: Long, err: Long) = { + val counters1 = counters + (item -> ((count, err))) + val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // add a single element + private[algebird] def add(x: SSOne[T]): SSMany[T] = { + require(x.capacity == capacity) + if (counters.contains(x.item)) + bump(x.item) + else + (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min) + } + + // merge two stream summaries + private def merge(x: SSMany[T]): SSMany[T] = { + require(x.capacity == capacity) + val counters1 = Map() ++ + (counters.keySet ++ x.counters.keySet).toList + .map { key => + val (count1, err1) = counters.getOrElse(key, (min, min)) + val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min)) + key -> ((count1 + count2, err1 + err2)) + } + .sorted(SpaceSaver.ordering) + .take(capacity) + SSMany(capacity, counters1) + } + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => add(other) + case other: SSMany[?]
=> merge(other) + } +} + +class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] { + override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y +} diff --git a/algebird-core/src/main/scala-2.12/VectorSpace.scala b/algebird-core/src/main/scala-2.12/VectorSpace.scala new file mode 100644 index 000000000..f8818600c --- /dev/null +++ b/algebird-core/src/main/scala-2.12/VectorSpace.scala @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +import scala.annotation.implicitNotFound + +/** + * This class represents a vector space. For the required properties see: + * + * http://en.wikipedia.org/wiki/Vector_space#Definition + */ +object VectorSpace extends VectorSpaceOps with Implicits + +sealed trait VectorSpaceOps { + def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] = + vs.scale(v, c) + def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] = + new VectorSpace[F, C] { + override def ring: Ring[F] = r + override def group: Group[C[F]] = cGroup + override def scale(v: F, c: C[F]): C[F] = + if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero + } +} +private object VectorSpaceOps extends VectorSpaceOps + +sealed trait Implicits extends LowPrioImpicits { + implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] = + VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _))) +} + +sealed trait LowPrioImpicits { + implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] = + VectorSpaceOps.from[T, Map[K, _]] { (s, m) => + m.transform { case (_, v) => Ring.times(s, v) } + } +} + +@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}") +trait VectorSpace[F, C[_]] extends java.io.Serializable { + implicit def ring: Ring[F] + def field: Ring[F] = ring // this is for compatibility with older versions + implicit def group: Group[C[F]] + def scale(v: F, c: C[F]): C[F] +} diff --git a/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala new file mode 100644 index 000000000..b6d5e2ffc --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/EitherMonad.scala @@ -0,0 +1,37 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// Monad for either, used for modeling Error where L is the type of the error +object EitherMonad { + class Error[L] extends Monad[Either[L, *]] { + override def apply[R](r: R): Right[L, R] = Right(r) + + override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] = + self.right.flatMap(next) + + override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] = + self.right.map(fn) + } + + implicit def monad[L]: Monad[Either[L, _]] = new Error[L] + + def assert[L](truth: Boolean, failure: => L): Either[L, Unit] = + if (truth) Right(()) else Left(failure) +} diff --git a/algebird-core/src/main/scala-2.12/monad/Reader.scala b/algebird-core/src/main/scala-2.12/monad/Reader.scala new file mode 100644 index 000000000..e0747af20 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/Reader.scala @@ -0,0 +1,76 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// TODO this is general, move somewhere better + +// Reader Monad, represents a series of operations that mutate some environment +// type (the input to the function) + +sealed trait Reader[-Env, +T] { + def apply(env: Env): T + def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] = + FlatMappedReader[E1, T, U](this, next) + def map[U](thatFn: T => U): Reader[Env, U] = + FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t))) +} + +final case class ConstantReader[+T](get: T) extends Reader[Any, T] { + override def apply(env: Any): T = get + override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get)) + override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] = + next(get) +} +final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] { + override def apply(env: E): T = fn(env) +} +final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] { + override def apply(env: E): T = { + @annotation.tailrec + def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any = + r match { + case ConstantReader(get) => + stack match { + case head :: tail => loop(head(get), tail) + case Nil => get + } + case ReaderFn(fn) => + stack match { + case head :: tail => loop(head(fn(env)), tail) + case Nil => fn(env) + } + case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack) + } + loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T] + } +} + +object Reader { + def const[T](t: T): Reader[Any, T] = ConstantReader(t) + implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn) + + class ReaderM[Env] extends Monad[Reader[Env, _]] { + override def apply[T](t: T): ConstantReader[T] = ConstantReader(t) + override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] = + self.flatMap(next) + override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] 
= self.map(fn) + } + + implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env] +} diff --git a/algebird-core/src/main/scala-2.12/monad/StateWithError.scala b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala new file mode 100644 index 000000000..e15a9ebc3 --- /dev/null +++ b/algebird-core/src/main/scala-2.12/monad/StateWithError.scala @@ -0,0 +1,130 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.{Monad, Semigroup} + +/** + * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase + * with existing mutable APIs (like storm or cascading), but retain the ability to compose carefully. + */ +sealed trait StateWithError[S, +F, +T] { + def join[F1 >: F, U]( + that: StateWithError[S, F1, U], + mergeErr: (F1, F1) => F1, + mergeState: (S, S) => S + ): StateWithError[S, F1, (T, U)] = + join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState)) + + def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit + sgf: Semigroup[F1], + sgs: Semigroup[S] + ): // TODO: deep joins could blow the stack, not yet using trampoline here + StateWithError[S, F1, (T, U)] = + StateFn { (requested: S) => + (run(requested), that.run(requested)) match { + case (Right((s1, r1)), Right((s2, r2))) => + Right((sgs.plus(s1, s2), (r1, r2))) + case (Left(err1), Left(err2)) => + Left(sgf.plus(err1, err2)) // Our earlier is not ready + case (Left(err), _) => Left(err) + case (_, Left(err)) => Left(err) + } + } + + def apply(state: S): Either[F, (S, T)] = run(state) + + def run(state: S): Either[F, (S, T)] + + def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] = + FlatMappedState(this, next) + + def map[U](fn: (T) => U): StateWithError[S, F, U] = + FlatMappedState(this, (t: T) => StateWithError.const(fn(t))) +} + +/** Simple wrapper of a function in the Monad */ +final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] { + override def run(state: S): Either[F, (S, T)] = fn(state) +} + +/** + * A Trampolining instance that should prevent stack overflow at the expense of performance + */ +final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U]) + extends StateWithError[S, F, U] { + override def run(state: S): Either[F, (S, U)] = { + @annotation.tailrec + def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any = + st match { + case StateFn(fn) => + fn(inState) match { + case err @ Left(_) => err // bail at first error + case noError @ Right((newState, out)) => + stack match { + case head :: tailStack => loop(newState, head(out), tailStack) + case Nil => noError // recursion ends + } + } + case FlatMappedState(st, next) => loop(inState, st, next :: stack) + } + loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]] + } +} + +object StateWithError { + def getState[S]: StateWithError[S, 
Nothing, S] = + StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala-2.13/Aggregator.scala b/algebird-core/src/main/scala-2.13/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. 
+ * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. 
+ * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next()) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator.
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count). 
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)
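A sketch contrasting the exact and approximate unique counters just defined; it assumes the implicit `Hash128[String]` instance that Algebird ships:

```scala
import com.twitter.algebird.Aggregator

object UniqueCountDemo {
  def main(args: Array[String]): Unit = {
    val ids = (1 to 10000).map(i => s"user-${i % 500}")
    // Exact: builds an in-memory Set, then takes its size.
    val exact: Int = Aggregator.uniqueCount[String].apply(ids)
    // Approximate: exact Set up to 100 items, then HLL with bounded memory.
    val approx: Long = Aggregator.approximateUniqueCount[String].apply(ids)
    println(s"exact=$exact approx=$approx") // exact=500, approx close to 500
  }
}
```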
+ + /** + * Returns the lower bound of a given percentile, where the percentile is in (0, 1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile, where the percentile is in (0, 1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the jvm (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * > import com.twitter.algebird.Ring.numericRing > def numericAggregator[T: Numeric]: MonoidAggregator[T, + * T, T] = Aggregator.fromMonoid[T] + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's fold package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get
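As a quick illustration of the prepare/reduce/present decomposition just defined, a minimal sketch using `prepareMonoid` from the object above:

```scala
import com.twitter.algebird.Aggregator

object PrepareReducePresentDemo {
  def main(args: Array[String]): Unit = {
    // prepare: String => Int, reduce: Monoid[Int].plus, present: identity
    val totalLength = Aggregator.prepareMonoid { (s: String) => s.length }
    println(totalLength(Seq("ann", "bob", "carol"))) // 3 + 3 + 5 = 11
  }
}
```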
+ + /** + * This is the safe version of the above. If the input is empty, return None, else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for Monoid Aggregators it never will; instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that))
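A sketch of the single-pass behavior `join` provides, pairing `numericSum` with `size` (both defined in this file) to compute a mean; the division in `andThenPresent` is this example's own, not library code:

```scala
import com.twitter.algebird.Aggregator

object JoinDemo {
  def main(args: Array[String]): Unit = {
    // Sum and count are computed together in one pass, then combined.
    val mean = Aggregator
      .numericSum[Int]
      .join(Aggregator.size)
      .andThenPresent { case (sum, n) => sum / n }
    println(mean(List(1, 2, 3, 4))) // 2.5
  }
}
```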
+ + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-2.13/CountMinSketch.scala b/algebird-core/src/main/scala-2.13/CountMinSketch.scala new file mode 100644 index 000000000..826aebd5a --- /dev/null +++ b/algebird-core/src/main/scala-2.13/CountMinSketch.scala @@ -0,0 +1,1420 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */
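The update and query rules described above are small enough to sketch directly. This toy table uses made-up hash constants purely for illustration, not Algebird's `CMSHash`:

```scala
// Toy count-min table following the rules above: d rows, w columns,
// counts(i)(h_i(x)) += 1 on update, min over rows on query.
object CountMinToy {
  private val depth = 3
  private val width = 271
  private val seeds = Array(31L, 131L, 313L) // illustrative constants
  private val counts = Array.ofDim[Long](depth, width)

  private def h(i: Int, x: Long): Int =
    (((seeds(i) * x) % 2147483647L).toInt.abs) % width

  def add(x: Long): Unit = // counts[i, h_i(x)] += 1 for each row i
    (0 until depth).foreach(i => counts(i)(h(i, x)) += 1L)

  def estimate(x: Long): Long = // min_i counts[i, h_i(x)]
    (0 until depth).map(i => counts(i)(h(i, x))).min
}
```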
+/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`; see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have fewer than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hashes functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`. 
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +}
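A quick check of the translations defined above, with values computed from the formulas in the scaladoc:

```scala
import com.twitter.algebird.CMSFunctions

object CMSSizingDemo {
  def main(args: Array[String]): Unit = {
    val eps = 0.001
    val delta = 1e-10
    println(CMSFunctions.width(eps))   // ceil(e / eps)       == 2719
    println(CMSFunctions.depth(delta)) // ceil(ln(1 / delta)) == 24
    println(CMSFunctions.eps(2719))    // ~0.001, the inverse translation
  }
}
```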
+ +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`. + */ + def depth: Int = CMSFunctions.depth(delta) + + /** + * Number of counters per hash function (also: number of columns in the counting table). This number is + * derived from `eps`. + */ + def width: Int = CMSFunctions.width(eps) + + /** + * An Option parameter about how many exact counts a sparse CMS wants to keep + */ + def maxExactCountOpt: Option[Int] + + /** + * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`. + */ + def maxExactCount: Int = + CMSFunctions.maxExactCount(maxExactCountOpt, depth, width) + + /** + * Returns a new sketch that is the combination of this sketch and the other sketch. + */ + def ++(other: C[K]): C[K] + + /** + * Counts the item and returns the result as a new sketch. + */ + def +(item: K): C[K] = this + (item, 1L) + + /** + * Counts the item `count` times and returns the result as a new sketch. + */ + def +(item: K, count: Long): C[K] + + /** + * Returns an estimate of the total number of times this item has been seen in the stream so far. This + * estimate is an upper bound. + * + * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also + * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`. + */ + def frequency(item: K): Approximate[Long] + + /** + * Returns an estimate of the inner product against another data stream. + * + * In other words, let a_i denote the number of times element i has been seen in the data stream summarized + * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of `<a, b> = + * \sum a_i b_i`. + * + * Note: This can also be viewed as the join size between two relations. + * + * It is always true that actualInnerProduct <= estimatedInnerProduct. With probability `p >= 1 - delta`, it + * also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`. + */ + def innerProduct(other: C[K]): Approximate[Long] + + /** + * Total number of elements counted (i.e. seen in the data stream) so far. + */ + def totalCount: Long + + /** + * The first frequency moment is the total number of elements in the stream. + */ + def f1: Long = totalCount + + /** + * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element. + */ + def f2: Approximate[Long] + +}
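A sketch of the frequency guarantee described above, assuming the implicit `CMSHasher[Long]` that Algebird ships:

```scala
import com.twitter.algebird.CMS

object FrequencyBoundDemo {
  def main(args: Array[String]): Unit = {
    val monoid = CMS.monoid[Long](eps = 0.001, delta = 1e-8, seed = 1)
    val cms = monoid.create((1L to 100L) ++ Seq.fill(50)(42L))
    val freq = cms.frequency(42L)
    // True count is 51; the estimate never undercounts, and overshoots
    // by at most eps * totalCount with probability >= 1 - delta.
    println(s"estimate=${freq.estimate} totalCount=${cms.totalCount}")
  }
}
```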
+ +/** + * A trait for CMS implementations that can track heavy hitters in a data stream. + * + * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one + * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N" + * heavy hitters. + * + * Known implementations: [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + */ +trait CMSHeavyHitters[K] { + + /** + * The pluggable logic of how heavy hitters are being tracked. + */ + def heavyHittersLogic: HeavyHittersLogic[K] + + /** + * Returns the set of heavy hitters. + */ + def heavyHitters: Set[K] + +} + +object CMS { + + def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] = + monoid(eps, delta, seed, None) + def monoid[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSMonoid[K] = + new CMSMonoid[K](eps, delta, seed, maxExactCountOpt) + + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] = + monoid(depth, width, seed, None) + def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] = + aggregator(eps, delta, seed, None) + def aggregator[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt)) + + def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] = + aggregator(depth, width, seed, None) + def aggregator[K: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + maxExactCountOpt: Option[Int] + ): CMSAggregator[K] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt) + + /** + * Returns a fresh, zeroed CMS instance. + */ + def apply[K: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + maxExactCountOpt: Option[Int] = None + ): CMS[K] = { + val params = { + val hashes: Seq[CMSHash[K]] = + CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + CMSZero[K](params) + } + +} + +/** + * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data + * stream. + * + * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]]. + * + * =Usage= + * + * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`. + * + * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is, + * the counting table behind the scenes is backed by `Long` values (at least in the current implementation), + * and thus the returned frequency estimates are always instances of `Approximate[Long]`. + * + * @example + * {{{ + * // Creates a monoid for a CMS that can count `Long` elements. + * val cmsMonoid: CMSMonoid[Long] = { + * val eps = 0.001 + * val delta = 1E-10 + * val seed = 1 + * CMS.monoid[Long](eps, delta, seed) + * } + * + * // Creates a CMS instance that has counted the element `1L`. + * val cms: CMS[Long] = cmsMonoid.create(1L) + * + * // Estimates the frequency of `1L` + * val estimate: Approximate[Long] = cms.frequency(1L) + * }}} + * + * @tparam K + * The type used to identify the elements to be counted. + */ +sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] { + + override val eps: Double = params.eps + + override val delta: Double = params.delta + + override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt + + override def f2: Approximate[Long] = innerProduct(this) + +}
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+  override val totalCount: Long = 0L
+
+  override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+  override def ++(other: CMS[K]): CMS[K] = other
+
+  override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+    extends CMS[K](params) {
+
+  override def +(x: K, count: Long): CMS[K] =
+    SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] =>
+        CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+      case _ => other + item
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+    exactCountTable: Map[K, Long],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+  import SparseCMS._
+
+  override def +(x: K, count: Long): CMS[K] = {
+    val currentCount = exactCountTable.getOrElse(x, 0L)
+    val newTable = exactCountTable.updated(x, currentCount + count)
+    if (newTable.size < maxExactCount) {
+      // still sparse
+      SparseCMS(newTable, totalCount = totalCount + count, params = params)
+    } else {
+      toDense(newTable, params)
+    }
+  }
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + (other.item, other.totalCount)
+      case other: SparseCMS[K] =>
+        // This SparseCMS's maxExactCount is used, so ++ is not commutative
+        val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+        if (newTable.size < maxExactCount) {
+          // still sparse
+          SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+        } else {
+          toDense(newTable, params)
+        }
+
+      case other: CMSInstance[K] => other ++ this
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    exactCountTable.iterator
+      .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+      .reduceOption(_ + _)
+      .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+  /**
+   * Creates a new [[SparseCMS]] with an empty exactCountTable.
+   */
+  def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+    val exactCountTable = Map[K, Long]()
+    SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+  }
+
+  /**
+   * Creates a new [[CMSInstance]] from a Map[K, Long]
+   */
+  def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+    exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+      cms + (x, count)
+    }
+}
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
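+ *
+ * A sketch of how a dense instance typically arises in this file (using `CMS.monoid` and `create`, as in the
+ * usage example on [[CMS]]; the concrete numbers are illustrative only):
+ * {{{
+ * val monoid = CMS.monoid[Long](eps = 0.001, delta = 1E-10, seed = 1)
+ * // Summing many single-item sketches starts out sparse and is promoted to a
+ * // dense CMSInstance once the exact-count table exceeds maxExactCount.
+ * val cms: CMS[Long] = monoid.sum((1L to 100000L).map(monoid.create))
+ * }}}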
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let X be a CMS, and let count_X[j, k] denote the value in X's 2-dimensional count table at row j and
+   * column k. Then the Count-Min sketch estimate of the inner product between A and B is the minimum inner
+   * product between their rows: estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
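+     *
+     * For example (an illustrative sketch; positions are `(row, col)` pairs):
+     * {{{
+     * val bumped = table + ((0, 3), 5L) // cell (0, 3) grows by 5; all other cells are unchanged
+     * }}}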
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?] => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
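+   *
+   * For example (a small sketch; `m` stands for any `TopCMSMonoid[String]`, e.g. one built via
+   * `TopPctCMS.monoid` below):
+   * {{{
+   * val sketch: TopCMS[String] = m.create(Seq("a", "b", "a"))
+   * }}}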
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
+ *   Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have fewer than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have fewer than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
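+ *
+ * For example, with (Country, City) pairs (an illustrative sketch; the parameters are arbitrary and the
+ * helpers are defined below in [[ScopedTopNCMS]]):
+ * {{{
+ * val m = ScopedTopNCMS.monoid[String, String](eps = 0.001, delta = 1E-10, seed = 1, heavyHittersN = 2)
+ * val cms = m.create(Seq("US" -> "NYC", "US" -> "LA", "US" -> "SF", "FR" -> "Paris"))
+ * // at most 2 cities are kept per country, so "FR" -> "Paris" survives even with a low global count
+ * }}}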
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-2.13/DecayedVector.scala b/algebird-core/src/main/scala-2.13/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-2.13/DecayingCMS.scala b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
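+     *
+     * A worked example of the formula derived in the method body (`timeToUnit = log(|value|) / log(2) +
+     * timeInHL`): a value of 8.0 at `timeInHL = 0.0` halves to 1.0 after exactly three half-lives, so its
+     * `timeToUnit` is 3.0.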
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        // 1 = value * module.getScale(0.0, timeInHL, result)
+        // 1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // 1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        // log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        // -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = getNextLogScale(0.0, timeInHL, result)
+        // log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        // log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        // log(value) = (result - timeInHL) * log2
+        // log(value) / log2 = result - timeInHL
+        // log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   *   delta = (t1 - t0) / halflife
+   *   y1 = y0 * 2^(-delta) + n
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   *   (1) zN = yN * scaleN
+   *
+   * Our constraint is expressed as:
+   *
+   *   (2) If n=0, z1 = z0
+   *
+   * In that case:
+   *
+   *   (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   *   (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   *   (5) 2^(-delta) * scale1 = scale0
+   *   (6) scale1 = scale0 * 2^(delta)
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   *   (7) z1 = y1 * scale1
+   *   (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   *   (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   *   (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   *   (11) z1 / scale1 = z0 / scale1 + n
+   *   (12) z1 = z0 + n * scale1
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   *   (13) logscale1 = logscale0 + delta * log(2)
+   *   (14) z1 = z0 + n * exp(logscale1)
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   *   (15) y1 = z1 / scale1
+   *   (16) y1 = z1 / exp(logscale1)
+   *   (17) y1 = z1 * exp(-logscale1)
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
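+     *
+     * A small usage sketch (`cms` stands for any CMS value from this module):
+     * {{{
+     * val (lo, hi) = cms.range
+     * // every key k satisfies lo <= cms.get(k), and the largest key is not reported above hi
+     * }}}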
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+      @inline def prod(x: Double, y: Double): Double =
+        if (x == 0.0) 0.0 else x * y
+
+      var i = 0
+      while (i < depth) {
+        val left = x.cells(i)
+        val right = y.cells(i)
+        var j = 0
+        val bldr = rowBuilder()
+        while (j < width) {
+          bldr += prod(left(j), xscale) + prod(right(j), yscale)
+          j += 1
+        }
+        cms.cells(i) = bldr.result()
+        i += 1
+      }
+      cms
+    }
+
+    def add(t: Long, k: K, n: Double): CMS =
+      scaledAdd(fromTimestamp(t), k, n)
+
+    // TODO: we could allocate a mutable scratch pad, write all the
+    // values into it, and then build a CMS out of it. if items is
+    // very small, this would be less efficient than what we're doing
+    // now. probably the "ideal" solution would be to determine how many
+    // items there are. if we have fewer than ~width items, this
+    // approach is fine. for more, a scratch pad would be better
+    // (assuming we wrote that code).
+    //
+    // alternately, you could map items into (zero + item) and then
+    // use the monoid's sum to boil it down.
+    //
+    // we only use this in testing currently so the current code is
+    // fine until we rely on it in production. any change here should
+    // probably include benchmarks justifying the design.
+    def bulkAdd(items: Iterable[(Long, K, Double)]): CMS =
+      items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) }
+
+    private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS =
+      if (n < 0.0) {
+        val t = toTimestamp(ts1)
+        throw new IllegalArgumentException(
+          s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t"
+        )
+      } else if (n == 0.0) {
+        this
+      } else {
+        val logScale1 = nextLogScale(ts1)
+        if (logScale1 > maxLogScale) {
+          rescaleTo(ts1).scaledAdd(ts1, k, n)
+        } else {
+          val increment = n * Math.exp(logScale1)
+          val cells1 = allocCells()
+          var didx = 0
+          while (didx < depth) {
+            val cell = cells(didx)
+            val w = hashFns(didx)(k)
+            cells1(didx) = cell.updated(w, cell(w) + increment)
+            didx += 1
+          }
+          new CMS(cells1, logScale1, ts1)
+        }
+      }
+
+    // Set the scale back to 0.0
+    // input time is in half-lives
+    private[algebird] def rescaleTo(ts: Double): CMS = {
+      val logScale1 = nextLogScale(ts)
+      val expL = Math.exp(-logScale1)
+      if (expL == 0.0) {
+        new CMS(monoid.zero.cells, 0.0, ts)
+      } else {
+        val cms = new CMS(allocCells(), 0.0, ts)
+        var i = 0
+        while (i < depth) {
+          val ci = cells(i)
+          cms.cells(i) = ci.map(_ * expL)
+          i += 1
+        }
+        cms
+      }
+    }
+  }
+
+  private def rowBuilder() = {
+    val bldr = Vector.newBuilder[Double]
+    bldr.sizeHint(width)
+    bldr
+  }
+
+  object CMS {
+
+    implicit val monoidForCMS: Monoid[CMS] =
+      new Monoid[CMS] {
+
+        def zero: CMS = module.empty
+
+        def plus(x: CMS, y: CMS): CMS =
+          x + y
+
+        /**
+         * Turn a flat array into an array of vectors.
+         */
+        private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = {
+          val cells = new Array[Vector[Double]](depth)
+          var i = 0
+          while (i < depth) {
+            var j = i * width
+            val limit = j + width
+            val bldr = rowBuilder()
+            while (j < limit) {
+              bldr += scratch(j)
+              j += 1
+            }
+            cells(i) = bldr.result()
+            i += 1
+          }
+          cells
+        }
+
+        /**
+         * This method sums the first `num` items in `arr`.
+ */ + private def innerSum(arr: Array[CMS], num: Int): CMS = + if (num == 0) zero + else if (num == 1) arr(0) + else if (num == 2) plus(arr(0), arr(1)) + else { + // start with zero + val scratch: Array[Double] = new Array(totalCells) + + val latestTimeInHL: Double = + arr.iterator.take(num).map(cms => cms.timeInHL).max + + var i = 0 + while (i < num) { + val cms = arr(i) + val scale = cms.getScale(latestTimeInHL) + var j = 0 + while (j < depth) { + val row = cms.cells(j) + val stride = j * width + var k = 0 + while (k < width) { + val n = row(k) + if (n > 0.0) { + scratch(stride + k) += scale * n + } + k += 1 + } + j += 1 + } + i += 1 + } + + val cells = scratchToCells(scratch) + + new CMS(cells, 0.0, latestTimeInHL) + } + + override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = { + + val it: Iterator[CMS] = xs.toIterator + val ChunkSize = 1000 + + // the idea here is that we read up to 1000 CMS values into + // a fixed array, crunch them down to a single CMS, store it + // in the first array index, read up to 999 more CMS values + // in, crunch them down, and so on. + var i = 0 + val arr = new Array[CMS](ChunkSize) + while (it.hasNext) { + while (it.hasNext && i < ChunkSize) { + arr(i) = it.next() + i += 1 + } + if (i > 1) { + arr(0) = innerSum(arr, i) + } + i = 1 + } + if (i == 0) None else Some(arr(0)) + } + } + } + + val monoid: Monoid[CMS] = CMS.monoidForCMS +} + +object DecayingCMS { + + /** + * Construct a DecayingCMS module. + * + * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will + * always produce the same hash family. + * + * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by + * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to + * zero. + * + * The size of the CMS in bytes is O(depth * width). + * + * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use + * width=100, for 0.1% error, use width=1000, etc. + * + * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha * + * depth) so, a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this + * as small as possible. + */ + def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit + hasher: CMSHasher[K] + ): DecayingCMS[K] = + new DecayingCMS(seed, halfLife, depth, width, hasher) +} diff --git a/algebird-core/src/main/scala-2.13/Fold.scala b/algebird-core/src/main/scala-2.13/Fold.scala new file mode 100644 index 000000000..0b89f2d62 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Fold.scala @@ -0,0 +1,352 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import java.io.Serializable +import scala.collection.compat._ + +/** + * Folds are first-class representations of "Traversable.foldLeft." 
They have the nice property that they can + * be fused to work in parallel over an input sequence. + * + * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when + * done. We use existential types to hide internal details and to allow for internal and external (X and O) + * types to differ for "map" and "join." + * + * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a + * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the + * fold. + * + * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like + * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also + * expose some internal state so library authors can fold over their own types. + * + * See the companion object for constructors. + */ +sealed trait Fold[-I, +O] extends Serializable { + + /** + * Users can ignore this type. + * + * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good + * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it + * provides. + */ + type X + + /** + * Users can ignore this method. It is exposed so library authors can run folds over their own sequence + * types. + * + * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the + * same Fold many times over different data structures, but we must build a new FoldState every time. + * + * See FoldState for information on how to use this for your own sequence types. + */ + def build(): FoldState[X, I, O] + + /** + * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or + * "Function1.compose." + */ + def map[P](f: O => P): Fold[I, P] = { + val self = this + new Fold[I, P] { + type X = self.X + override def build(): FoldState[X, I, P] = + self.build().map(f) + } + } + + /** + * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time + * and combines at the end. + */ + def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = { + val self = this + new Fold[I2, Q] { + type X = (self.X, other.X) + override def build(): FoldState[X, I2, Q] = { + val first = self.build() + val second = other.build() + new FoldState( + { case ((x, y), i) => (first.add(x, i), second.add(y, i)) }, + (first.start, second.start), + { case (x, y) => f(first.end(x), second.end(y)) } + ) + } + } + } + + /** + * Convenient shorthand for joining Folds without combining at the end. + */ + def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] = + joinWith(other) { case (o, p) => (o, p) } + + /** + * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.") + * This is analogous to "Function1.andThen." + */ + def contramap[H](f: H => I): Fold[H, O] = { + val self = this + new Fold[H, O] { + type X = self.X + override def build(): FoldState[X, H, O] = + self.build().contramap(f) + } + } + + /** + * Trivially runs a Fold over an empty sequence. + */ + def overEmpty: O = { + // build is a "def" so we construct the state once and use the pieces to run the fold + val state = build() + state.end(state.start) + } + + /** + * Trivially runs a Fold over a single element sequence. 
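+   *
+   * For example, using the `size` fold defined in the companion object:
+   * {{{
+   * Fold.size.overSingleton("a") // == 1L
+   * }}}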
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are:
+ *   - add: (X, I) => X - updates and returns internal state for every input I
+ *   - start: X - the initial state
+ *   - end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
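A sketch of `Fold.fold`'s separate-accumulator style (editorial, assuming the snapshot compiles as shown): a mean fold whose `(sum, count)` accumulator is hidden behind the existential `X`, so callers only ever see the `Double` output:

```scala
import com.twitter.algebird.Fold

// Editorial sketch, not part of the diff.
object CustomFold {
  // The (sum, count) accumulator stays internal to the Fold.
  val mean: Fold[Double, Double] =
    Fold.fold[(Double, Long), Double, Double](
      { case ((s, n), x) => (s + x, n + 1L) },       // add
      (0.0, 0L),                                     // start
      { case (s, n) => if (n == 0L) 0.0 else s / n } // end
    )

  def main(args: Array[String]): Unit =
    println(mean.overTraversable(List(1.0, 2.0, 6.0))) // 3.0
}
```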
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const: def const[A, B](b: B): (A
+   * \=> B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
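The stock folds above compose the same way as hand-written ones; a hypothetical one-pass min/max/size (editorial sketch, not part of the diff):

```scala
import com.twitter.algebird.Fold

// Editorial sketch, not part of the diff.
object BuiltInFolds {
  def main(args: Array[String]): Unit = {
    // min, max and size run together in a single traversal.
    val stats = Fold.min[Int].join(Fold.max[Int]).join(Fold.size)
    println(stats.overTraversable(List(3, 1, 4, 1, 5)))
    // ((Some(1),Some(5)),5)
  }
}
```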
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-2.13/Interval.scala b/algebird-core/src/main/scala-2.13/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
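A hedged usage sketch of the core `Interval` operations so far (editorial; `IntervalBasics` is an invented name, and the bound case classes appear later in this same file):

```scala
import com.twitter.algebird._

// Editorial sketch, not part of the diff.
object IntervalBasics {
  def main(args: Array[String]): Unit = {
    // Intersecting a lower and an upper bound yields [3, 10).
    val range: Interval[Int] = InclusiveLower(3) && ExclusiveUpper(10)

    println(range(3))                   // true: apply is an alias for contains
    println(range(10))                  // false: the upper bound is exclusive
    println((range && Empty[Int]())(3)) // false: Empty absorbs any intersection
  }
}
```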
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that
+     * intr.contains(s)
+     *
+     * If this returns None, it may be Empty, Upper or Universe
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s)
+     *
+     * If this returns None, it may be Empty, Lower, or Universe
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
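A sketch of the subtlety this comment describes, assuming algebird's usual implicit `Successible`/`Predecessible` instances for `Int` are in scope (an assumption; they are not shown in this diff):

```scala
import com.twitter.algebird._

// Editorial sketch, not part of the diff.
object DiscreteEmptiness {
  def main(args: Array[String]): Unit = {
    // (0, 1) is non-empty over Double but contains no Int at all.
    // Ordering alone cannot see this; isEmpty consults Successible/Predecessible.
    val open01: Interval[Int] = Interval.open(0, 1)
    println(open01.isEmpty) // true: no integer lies strictly between 0 and 1
  }
}
```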
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option because of cases like
+   * ExclusiveLower(Int.MaxValue), which are pathological and equivalent to Empty.
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p:
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot be
+   * normalized to [a, b) form, thus we return an Option. Also, there are cases, like [Int.MinValue,
+   * Int.MaxValue], that cannot be expressed in this form yet are actually equivalent to Universe. The bottom
+   * line: if this returns None, it just means you can't express the interval this way; it does not mean it
+   * is empty or universe, etc... (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?]
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-2.13/JavaMonoids.scala b/algebird-core/src/main/scala-2.13/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-2.13/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */
+package com.twitter.algebird
+
+import java.lang.{
+  Boolean => JBool,
+  Double => JDouble,
+  Float => JFloat,
+  Integer => JInt,
+  Long => JLong,
+  Short => JShort
+}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
+
+import scala.collection.JavaConverters._
+
+object JIntRing extends Ring[JInt] {
+  override val zero: JInt = JInt.valueOf(0)
+  override val one: JInt = JInt.valueOf(1)
+  override def plus(x: JInt, y: JInt): JInt = x + y
+  override def negate(x: JInt): JInt = -x
+  override def minus(x: JInt, y: JInt): JInt = x - y
+  override def times(x: JInt, y: JInt): JInt = x * y
+}
+
+object JShortRing extends Ring[JShort] {
+  override val zero: JShort = Short.box(0)
+  override val one: JShort = Short.box(1)
+  override def plus(x: JShort, y: JShort): JShort = (x + y).toShort
+  override def negate(x: JShort): JShort = (-x).toShort
+  override def minus(x: JShort, y: JShort): JShort = (x - y).toShort
+  override def times(x: JShort, y: JShort): JShort = (x * y).toShort
+}
+
+object JLongRing extends Ring[JLong] {
+  override val zero: JLong = JLong.valueOf(0L)
+  override val one: JLong = JLong.valueOf(1L)
+  override def plus(x: JLong, y: JLong): JLong = x + y
+  override def negate(x: JLong): JLong = -x
+  override def minus(x: JLong, y: JLong): JLong = x - y
+  override def times(x: JLong, y: JLong): JLong = x * y
+}
+
+object JFloatRing extends Ring[JFloat] {
+  override val zero: JFloat = JFloat.valueOf(0.0f)
+  override val one: JFloat = JFloat.valueOf(1.0f)
+  override def plus(x: JFloat, y: JFloat): JFloat = x + y
+  override def negate(x: JFloat): JFloat = -x
+  override def minus(x: JFloat, y: JFloat): JFloat = x - y
+  override def times(x: JFloat, y: JFloat): JFloat = x * y
+}
+
+object JDoubleRing extends Ring[JDouble] {
+  override val zero: JDouble = JDouble.valueOf(0.0)
+  override val one: JDouble = JDouble.valueOf(1.0)
+  override def plus(x: JDouble, y: JDouble): JDouble = x + y
+  override def negate(x: JDouble): JDouble = -x
+  override def minus(x: JDouble, y: JDouble): JDouble = x - y
+  override def times(x: JDouble, y: JDouble): JDouble = x * y
+}
+
+object JBoolRing extends Ring[JBool] {
+  override val zero: JBool = JBool.FALSE
+  override val one: JBool = JBool.TRUE
+  override def plus(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue ^ y.booleanValue)
+  override def negate(x: JBool): JBool = x
+  override def minus(x: JBool, y: JBool): JBool = plus(x, y)
+  override def times(x: JBool, y: JBool): JBool =
+    JBool.valueOf(x.booleanValue & y.booleanValue)
+}
+
+/**
+ * Since Lists are mutable, this always makes a full copy. Prefer scala immutable Lists: if you use scala
+ * immutable lists, the tail of the result of plus is always the right argument.
+ */
+class JListMonoid[T] extends Monoid[JList[T]] {
+  override def isNonZero(x: JList[T]): Boolean = !x.isEmpty
+  override lazy val zero: JArrayList[T] = new JArrayList[T](0)
+  override def plus(x: JList[T], y: JList[T]): JArrayList[T] = {
+    val res = new JArrayList[T](x.size + y.size)
+    res.addAll(x)
+    res.addAll(y)
+    res
+  }
+}
+
+/**
+ * Since maps are mutable, this always makes a full copy. Prefer scala immutable maps: if you use scala
+ * immutable maps, this operation is much faster. TODO: extend this to Group, Ring.
+ */
+class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] {
+  override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0)
+
+  val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match {
+    case mon: Monoid[?]
=> mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: JMap[K, V]): Boolean = + !x.isEmpty && (implicitly[Semigroup[V]] match { + case mon: Monoid[?] => + x.values.asScala.exists(v => mon.isNonZero(v)) + case _ => true + }) + override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + val vsemi = implicitly[Semigroup[V]] + val result = new JHashMap[K, V](big.size + small.size) + result.putAll(big) + small.entrySet.asScala.foreach { kv => + val smallK = kv.getKey + val smallV = kv.getValue + if (big.containsKey(smallK)) { + val bigV = big.get(smallK) + val newV = + if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV) + if (nonZero(newV)) + result.put(smallK, newV) + else + result.remove(smallK) + } else { + // No need to explicitly add with zero on V, just put in the small value + result.put(smallK, smallV) + } + } + result + } +} diff --git a/algebird-core/src/main/scala-2.13/MapAlgebra.scala b/algebird-core/src/main/scala-2.13/MapAlgebra.scala new file mode 100644 index 000000000..9ca370eaf --- /dev/null +++ b/algebird-core/src/main/scala-2.13/MapAlgebra.scala @@ -0,0 +1,320 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import com.twitter.algebird.macros.{Cuber, Roller} +import scala.collection.mutable.{Builder, Map => MMap} +import scala.collection.{Map => ScMap} +import algebra.ring.Rng +import scala.collection.compat._ + +trait MapOperations[K, V, M <: ScMap[K, V]] { + def add(oldMap: M, kv: (K, V)): M + def remove(oldMap: M, k: K): M + def fromMutable(mut: MMap[K, V]): M +} + +abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V]) + extends Monoid[M] + with MapOperations[K, V, M] { + + val nonZero: (V => Boolean) = semigroup match { + case mon: Monoid[?] => mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: M): Boolean = + !x.isEmpty && (semigroup match { + case mon: Monoid[?] => + x.valuesIterator.exists(v => mon.isNonZero(v)) + case _ => true + }) + + override def plus(x: M, y: M): M = { + // Scala maps can reuse internal structure, so don't copy just add into the bigger one: + // This really saves computation when adding lots of small maps into big ones (common) + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small match { + // Mutable maps create new copies of the underlying data on add so don't use the + // handleImmutable method. + // Cannot have a None so 'get' is safe here. + case _: MMap[?, ?] 
=> sumOption(Seq(big, small)).get + case _ => handleImmutable(big, small, bigOnLeft) + } + } + + private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) = + small.foldLeft(big) { (oldMap, kv) => + val newV = big + .get(kv._1) + .map { bigV => + if (bigOnLeft) + semigroup.plus(bigV, kv._2) + else + semigroup.plus(kv._2, bigV) + } + .getOrElse(kv._2) + if (nonZero(newV)) + add(oldMap, kv._1 -> newV) + else + remove(oldMap, kv._1) + } + override def sumOption(items: TraversableOnce[M]): Option[M] = + if (items.iterator.isEmpty) None + else { + val mutable = MMap[K, V]() + items.iterator.foreach { m => + m.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val newV = + if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v) + if (nonZero(newV)) + mutable.update(k, newV) + else + mutable.remove(k) + } + } + Some(fromMutable(mutable)) + } +} + +class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] { + override lazy val zero: Map[K, V] = Map[K, V]() + override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv + override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): Map[K, V] = + new MutableBackedMap(mut) +} + +class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] { + override lazy val zero: ScMap[K, V] = ScMap[K, V]() + override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv + override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): ScMap[K, V] = + new MutableBackedMap(mut) +} + +/** + * You can think of this as a Sparse vector group + */ +class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] { + override def negate(kv: Map[K, V]): Map[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +class ScMapGroup[K, V](implicit val group: Group[V]) + extends ScMapMonoid[K, V]()(group) + with Group[ScMap[K, V]] { + override def negate(kv: ScMap[K, V]): ScMap[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +/** + * You can think of this as a Sparse vector ring + */ +trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] { + + implicit def ring: Ring[V] + + override def times(x: M, y: M): M = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small.foldLeft(zero) { (oldMap, kv) => + val bigV = big.getOrElse(kv._1, ring.zero) + val newV = + if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV) + if (ring.isNonZero(newV)) { + add(oldMap, kv._1 -> newV) + } else { + remove(oldMap, kv._1) + } + } + } +} + +class MapRing[K, V](implicit override val ring: Ring[V]) + extends MapGroup[K, V]()(ring) + with GenericMapRing[K, V, Map[K, V]] + +class ScMapRing[K, V](implicit override val ring: Ring[V]) + extends ScMapGroup[K, V]()(ring) + with GenericMapRing[K, V, ScMap[K, V]] + +object MapAlgebra { + def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean = + l.forall { case (k, v) => + r.get(k).exists(Equiv[V].equiv(_, v)) + } + + implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] = + Equiv.fromFunction { (m1, m2) => + val cleanM1 = removeZeros(m1) + val cleanM2 = removeZeros(m2) + rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, 
cleanM1)
+    }
+
+  def mergeLookup[T, U, V: Monoid](
+      keys: TraversableOnce[T]
+  )(lookup: T => Option[V])(present: T => U): Map[U, V] =
+    sumByKey {
+      keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V]))
+    }
+
+  // Returns a new map with zero-value entries removed
+  def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] =
+    m.filter { case (_, v) => Monoid.isNonZero(v) }
+
+  /**
+   * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from
+   * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, this function is
+   * equivalent to:
+   *
+   * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum)
+   *
+   * Otherwise, the function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2).sum)
+   */
+  def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] =
+    Monoid.sum(pairs.iterator.map(Map(_)))
+
+  /**
+   * For each key, creates a list of all values. This function is equivalent to:
+   *
+   * pairs.groupBy(_._1).mapValues(_.map(_._2))
+   */
+  def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] =
+    if (pairs.iterator.isEmpty) Map.empty
+    else {
+      val mutable = MMap[K, Builder[V, List[V]]]()
+      pairs.iterator.foreach { case (k, v) =>
+        val oldVOpt = mutable.get(k)
+        // sorry for the micro optimization here: avoiding a closure
+        val bldr = if (oldVOpt.isEmpty) {
+          val b = List.newBuilder[V]
+          mutable.update(k, b)
+          b
+        } else oldVOpt.get
+        bldr += v
+      }
+      mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap
+    }
+
+  // Consider this as edges from k -> v, produce a Map[K, Set[V]]
+  def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] =
+    Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) })
+
+  /** join the keys of two maps (similar to outer-join in a DB) */
+  def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] =
+    Monoid
+      .plus(
+        map1.transform { case (_, v) =>
+          (List(v), List[W]())
+        },
+        map2.transform { case (_, w) =>
+          (List[V](), List(w))
+        }
+      )
+      .transform { case (_, (v, w)) => (v.headOption, w.headOption) }
+
+  /**
+   * Reverses a graph losslessly. The None key is for v's with no sources.
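An editorial sketch of the workhorse functions above (`sumByKey`, `group`, `join`), not part of the diff and assuming the snapshot compiles as shown:

```scala
import com.twitter.algebird.MapAlgebra

// Editorial sketch, not part of the diff.
object MapAlgebraBasics {
  def main(args: Array[String]): Unit = {
    val pairs = List("a" -> 1, "b" -> 2, "a" -> 3)

    // Values are combined per key with Semigroup[Int].
    println(MapAlgebra.sumByKey(pairs)) // Map(a -> 4, b -> 2)

    // group keeps every value, preserving encounter order.
    println(MapAlgebra.group(pairs))    // Map(a -> List(1, 3), b -> List(2))

    // join is an outer join on the two key sets.
    println(MapAlgebra.join(Map("a" -> 1), Map("b" -> "x")))
    // Map(a -> (Some(1),None), b -> (None,Some(x)))
  }
}
```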
+   */
+  def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = {
+    def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] =
+      if (i.isEmpty) Iterable(None)
+      else {
+        i.map(Some(_))
+      }
+
+    Monoid.sum {
+      for {
+        (k, sv) <- m.view.toIterable
+        v <- nonEmptyIter(sv)
+      } yield Map(v -> k.toSet)
+    }
+  }
+
+  /**
+   * Invert the common case of exactly one value for each key
+   */
+  def invert[K, V](m: Map[K, V]): Map[V, Set[K]] =
+    Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) })
+
+  def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V =
+    Monoid.sum(mring.times(left, right).values)
+
+  def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = {
+    val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      c(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None     => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) })
+
+  def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit c: Cuber[K]): Map[c.K, V] =
+    sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+  def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = {
+    val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]()
+    it.iterator.foreach { case (k, v) =>
+      r(k).iterator.foreach { ik =>
+        map.get(ik) match {
+          case Some(vs) => map += ik -> (v :: vs)
+          case None     => map += ik -> List(v)
+        }
+      }
+    }
+    map.foreach { case (k, v) => map(k) = v.reverse }
+    new MutableBackedMap(map)
+  }
+
+  def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) })
+
+  def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])(
+      fn: T => K
+  )(implicit r: Roller[K]): Map[r.K, V] =
+    sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup)
+      .map { case (k, v) => (k, agg.present(v)) }
+
+}
diff --git a/algebird-core/src/main/scala-2.13/Scan.scala b/algebird-core/src/main/scala-2.13/Scan.scala
new file mode 100644
index 000000000..2dc2ff9c2
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/Scan.scala
@@ -0,0 +1,333 @@
+package com.twitter.algebird
+
+import scala.collection.compat._
+
+object Scan {
+
+  /**
+   * Most consumers of Scan don't care about the State type variable. But for those that do, we make an
+   * effort to expose it in all of our combinators.
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce a
+   * stream of outputs. Streams can be thought of as being a hidden state that is queryable for a head
+   * element, and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head element and tail stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i = initState +
+   *   aggregator.prepare(a_1) + ... +
+   *   aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final
+      // summation; this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where `c_i =
+   *   monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality as
+ * `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that I'm
+ * scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an iterator
+ * with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair within
+ * the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is done,
+ * then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessingI, along with the
+   *   result of updating stateBeforeProcessingI with the information from i.
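An editorial sketch of driving a `Scan` (object name invented; assumes the `scala-collection-compat` `BuildFrom` imported above):

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object ScanBasics {
  def main(args: Array[String]): Unit = {
    // A running sum: each output is the sum of everything seen so far.
    val runningSum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // N inputs produce exactly N outputs, unlike scanLeft's N + 1.
    println(runningSum(List(1, 2, 3, 4))) // List(1, 3, 6, 10)

    // Scan.index ignores its inputs and emits positions.
    println(Scan.index(List("a", "b", "c"))) // List(0, 1, 2)
  }
}
```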
+   */
+  def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State)
+
+  /**
+   * @param iter
+   * @return
+   *   If `iter = Iterator(a_1, ..., a_n)`, return `Iterator(o_1, ..., o_n)` where `(o_i, state_i) =
+   *   presentAndNextState(a_i, state_(i-1))` and `state_0 = initialState`
+   */
+  def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] {
+    override def hasNext: Boolean = iter.hasNext
+    var state: State = initialState
+    override def next(): O = {
+      val thisState = state
+      val thisA = iter.next()
+      val (thisC, nextState) = presentAndNextState(thisA, thisState)
+      state = nextState
+      thisC
+    }
+  }
+
+  /**
+   * @param inputs
+   * @param bf
+   * @tparam In
+   *   The type of the input collection
+   * @tparam Out
+   *   The type of the output collection
+   * @return
+   *   Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the
+   *   form: `[o_1, ..., o_n]` where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`.
+   */
+  def apply[In <: TraversableOnce[I], Out](
+      inputs: In
+  )(implicit bf: BuildFrom[In, O, Out]): Out =
+    bf.fromSpecific(inputs)(scanIterator(inputs.toIterator))
+
+  // combinators
+
+  /**
+   * Return a new scan that is the same as this scan, but with a different `initialState`.
+   * @param newInitialState
+   * @return
+   */
+  def replaceState(newInitialState: => State): Aux[I, State, O] =
+    from(newInitialState)(presentAndNextState(_, _))
+
+  def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) =>
+    presentAndNextState(f(i), stateBeforeProcessingI)
+  }
+
+  def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    (g(c), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't
+   * pollute the `State` by pairing it redundantly with `Unit`.
+   * @tparam I1
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, then this results in a Scan whose `apply` method returns
+   *   `[(o_1, a_1), ..., (o_n, a_n)]` when given the same input.
+   */
+  def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI)
+    ((o, i), stateAfterProcessingI)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan before each input updates the state.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_0), ..., (o_n, state_(n-1))]`.
+   */
+  def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((stateBeforeProcessingI, o), stateAfterProcessingA)
+  }
+
+  /**
+   * Return a scan whose output is paired with the state of the scan after each input updates the state.
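The two state-joining combinators differ only in which state they pair with each output; a hypothetical sketch (editorial, not part of the diff):

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object StateJoins {
  def main(args: Array[String]): Unit = {
    val sum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // Pair each output with the state *before* the input was folded in...
    println(sum.joinWithPriorState(List(1, 2, 3)))
    // List((0,1), (1,3), (3,6))

    // ...or with the state *after*.
    println(sum.joinWithPosteriorState(List(1, 2, 3)))
    // List((1,1), (3,3), (6,6))
  }
}
```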
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_i, state_i) = presentAndNextState(a_i, state_(i-1))` and `state_0 =
+   *   initialState`, return a scan whose apply method, when given inputs `[a_1, ..., a_n]` will return
+   *   `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose apply method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]` then `zip` will return a scan
+   *   whose apply method, when given input `[(a_1, b_1), ..., (a_n, b_n)]` results in the output
+   *   `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) ==
+   *   scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]` then `join` will return a scan
+   *   whose apply method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's apply method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]` then `compose` will return a
+   *   scan which returns `[p_1, ..., p_n]`.
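A short editorial sketch contrasting `join` (two scans over the same inputs, fused into one pass) with `compose` (one scan's outputs piped into another); names invented, not part of the diff:

```scala
import com.twitter.algebird.Scan

// Editorial sketch, not part of the diff.
object ScanCombinators {
  def main(args: Array[String]): Unit = {
    val sum: Scan.Aux[Int, Int, Int] =
      Scan.from(0)((i, s) => { val s2 = s + i; (s2, s2) })

    // join runs two scans over the same inputs in one pass.
    println(sum.join(Scan.index)(List(10, 20, 30)))
    // List((10,0), (30,1), (60,2))

    // compose feeds sum's outputs into a running-maximum scan.
    val maxSoFar: Scan.Aux[Int, Int, Int] =
      Scan.from(Int.MinValue)((i, m) => { val m2 = math.max(i, m); (m2, m2) })
    println(sum.compose(maxSoFar)(List(3, -5, 7)))
    // List(3, 3, 5)
  }
}
```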
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-2.13/SpaceSaver.scala b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-2.13/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public api to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing, in order:
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the counters (a 4-byte count, then for each counter: item length + item bytes + two 8-byte
+   *     counts)
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer,
tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] = + Try { + bb.get().toInt match { + case 1 => + val capacity = bb.getInt + val itemLength = bb.getInt + val itemAsBytes = new Array[Byte](itemLength) + bb.get(itemAsBytes) + tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item)) + case 2 => + val capacity = bb.getInt + + var countersToDeserialize = bb.getInt + val counters = scala.collection.mutable.Map.empty[T, (Long, Long)] + while (countersToDeserialize != 0) { + val itemLength = bb.getInt() + val itemAsBytes = new Array[Byte](itemLength) + bb.get(itemAsBytes) + val item = tDeserializer(ByteBuffer.wrap(itemAsBytes)) + + val a = bb.getLong + val b = bb.getLong + + item match { + case Failure(e) => return Failure(e) + case Success(i) => + counters += ((i, (a, b))) + } + + countersToDeserialize -= 1 + } + + Success(SSMany(capacity, counters.toMap)) + } + }.flatten +} + +/** + * Data structure used in the Space-Saving Algorithm to find the approximate most frequent and top-k elements. + * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See + * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called + * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and + * parallelization were not described in the article and have not been proven to be mathematically correct or + * to preserve the guarantees or benefits of the algorithm. + */ +sealed abstract class SpaceSaver[T] { + import SpaceSaver.ordering + + /** + * Maximum number of counters to keep (parameter "m" in the research paper). + */ + def capacity: Int + + /** + * Current lowest value for count + */ + def min: Long + + /** + * Map of item to counter, where each counter consists of an observed count and possible over-estimation + * (error) + */ + def counters: Map[T, (Long, Long)] + + def ++(other: SpaceSaver[T]): SpaceSaver[T] + + /** + * Returns the frequency estimate for the item + */ + def frequency(item: T): Approximate[Long] = { + val (count, err) = counters.getOrElse(item, (min, min)) + Approximate(count - err, count, count, 1.0) + } + + /** + * Get the elements that show up more than thres times. Results are sorted in descending order: (item, + * Approximate[Long], guaranteed) + */ + def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] = + counters.iterator + .filter { case (_, (count, _)) => count >= thres } + .toList + .sorted(ordering) + .map { case (item, (count, err)) => + (item, Approximate(count - err, count, count, 1.0), thres <= count - err) + } + + /** + * Get the top-k elements. Results are sorted in descending order: (item, Approximate[Long], guaranteed) + */ + def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = { + require(k < capacity) + val si = counters.toList + .sorted(ordering) + val siK = si.take(k) + val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L) + siK.map { case (item, (count, err)) => + (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err) + } + } + + /** + * Check consistency with other SpaceSaver, useful for testing.
Returns boolean indicating if they are + * consistent + */ + def consistentWith(that: SpaceSaver[T]): Boolean = + (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0) +} + +case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] { + require(capacity > 1) + + override def min: Long = 0L + + override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L))) + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => SSMany(this).add(other) + case other: SSMany[?] => other.add(this) + } +} + +object SSMany { + private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] = + SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap + + private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] = + SSMany(capacity, counters, bucketsFromCounters(counters)) + + private[algebird] def apply[T](one: SSOne[T]): SSMany[T] = + SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item))) +} + +case class SSMany[T] private ( + override val capacity: Int, + override val counters: Map[T, (Long, Long)], + buckets: SortedMap[Long, Set[T]] +) extends SpaceSaver[T] { + private val exact: Boolean = counters.size < capacity + + override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey + + // item is already present and just needs to be bumped up one + private def bump(item: T) = { + val (count, err) = counters(item) + val counters1 = counters + (item -> ((count + 1L, err))) // increment by one + val currBucket = buckets(count) // current bucket + val buckets1 = { + if (currBucket.size == 1) // delete current bucket since it will be empty + buckets - count + else // remove item from current bucket + buckets + (count -> (currBucket - item)) + } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // lose one item to meet capacity constraint + private def loseOne = { + val firstBucket = buckets(buckets.firstKey) + val itemToLose = firstBucket.head + val counters1 = counters - itemToLose + val buckets1 = + if (firstBucket.size == 1) + buckets - min + else + buckets + (min -> (firstBucket - itemToLose)) + SSMany(capacity, counters1, buckets1) + } + + // introduce new item + private def introduce(item: T, count: Long, err: Long) = { + val counters1 = counters + (item -> ((count, err))) + val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item)) + SSMany(capacity, counters1, buckets1) + } + + // add a single element + private[algebird] def add(x: SSOne[T]): SSMany[T] = { + require(x.capacity == capacity) + if (counters.contains(x.item)) + bump(x.item) + else + (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min) + } + + // merge two stream summaries + private def merge(x: SSMany[T]): SSMany[T] = { + require(x.capacity == capacity) + val counters1 = Map() ++ + (counters.keySet ++ x.counters.keySet).toList + .map { key => + val (count1, err1) = counters.getOrElse(key, (min, min)) + val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min)) + key -> ((count1 + count2, err1 + err2)) + } + .sorted(SpaceSaver.ordering) + .take(capacity) + SSMany(capacity, counters1) + } + + override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match { + case other: SSOne[?] => add(other) + case other: SSMany[?] 
=> merge(other) + } +} + +class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] { + override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y +} diff --git a/algebird-core/src/main/scala-2.13/VectorSpace.scala b/algebird-core/src/main/scala-2.13/VectorSpace.scala new file mode 100644 index 000000000..f8818600c --- /dev/null +++ b/algebird-core/src/main/scala-2.13/VectorSpace.scala @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +import scala.annotation.implicitNotFound + +/** + * This class represents a vector space. For the required properties see: + * + * http://en.wikipedia.org/wiki/Vector_space#Definition + */ +object VectorSpace extends VectorSpaceOps with Implicits + +sealed trait VectorSpaceOps { + def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] = + vs.scale(v, c) + def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] = + new VectorSpace[F, C] { + override def ring: Ring[F] = r + override def group: Group[C[F]] = cGroup + override def scale(v: F, c: C[F]): C[F] = + if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero + } +} +private object VectorSpaceOps extends VectorSpaceOps + +sealed trait Implicits extends LowPrioImplicits { + implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] = + VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _))) +} + +sealed trait LowPrioImplicits { + implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] = + VectorSpaceOps.from[T, Map[K, _]] { (s, m) => + m.transform { case (_, v) => Ring.times(s, v) } + } +} + +@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}") +trait VectorSpace[F, C[_]] extends java.io.Serializable { + implicit def ring: Ring[F] + def field: Ring[F] = ring // this is for compatibility with older versions + implicit def group: Group[C[F]] + def scale(v: F, c: C[F]): C[F] +} diff --git a/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala new file mode 100644 index 000000000..b6d5e2ffc --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/EitherMonad.scala @@ -0,0 +1,37 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+ */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// Monad for Either, used for modeling computations that can fail, where L is the type of the error +object EitherMonad { + class Error[L] extends Monad[Either[L, *]] { + override def apply[R](r: R): Right[L, R] = Right(r) + + override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] = + self.right.flatMap(next) + + override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] = + self.right.map(fn) + } + + implicit def monad[L]: Monad[Either[L, _]] = new Error[L] + + def assert[L](truth: Boolean, failure: => L): Either[L, Unit] = + if (truth) Right(()) else Left(failure) +} diff --git a/algebird-core/src/main/scala-2.13/monad/Reader.scala b/algebird-core/src/main/scala-2.13/monad/Reader.scala new file mode 100644 index 000000000..e0747af20 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/Reader.scala @@ -0,0 +1,76 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.Monad + +// TODO this is general, move somewhere better + +// Reader Monad: represents a series of operations that read from a shared environment +// type (the input to the function) + +sealed trait Reader[-Env, +T] { + def apply(env: Env): T + def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] = + FlatMappedReader[E1, T, U](this, next) + def map[U](thatFn: T => U): Reader[Env, U] = + FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t))) +} + +final case class ConstantReader[+T](get: T) extends Reader[Any, T] { + override def apply(env: Any): T = get + override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get)) + override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] = + next(get) +} +final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] { + override def apply(env: E): T = fn(env) +} +final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] { + override def apply(env: E): T = { + @annotation.tailrec + def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any = + r match { + case ConstantReader(get) => + stack match { + case head :: tail => loop(head(get), tail) + case Nil => get + } + case ReaderFn(fn) => + stack match { + case head :: tail => loop(head(fn(env)), tail) + case Nil => fn(env) + } + case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack) + } + loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T] + } +} + +object Reader { + def const[T](t: T): Reader[Any, T] = ConstantReader(t) + implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn) + + class ReaderM[Env] extends Monad[Reader[Env, _]] { + override def apply[T](t: T): ConstantReader[T] = ConstantReader(t) + override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] = + self.flatMap(next) + override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U]
= self.map(fn) + } + + implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env] +} diff --git a/algebird-core/src/main/scala-2.13/monad/StateWithError.scala b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala new file mode 100644 index 000000000..e15a9ebc3 --- /dev/null +++ b/algebird-core/src/main/scala-2.13/monad/StateWithError.scala @@ -0,0 +1,130 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird.monad + +import com.twitter.algebird.{Monad, Semigroup} + +/** + * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase + * with existing mutable APIs (like Storm or Cascading), while retaining the ability to compose carefully. + */ +sealed trait StateWithError[S, +F, +T] { + def join[F1 >: F, U]( + that: StateWithError[S, F1, U], + mergeErr: (F1, F1) => F1, + mergeState: (S, S) => S + ): StateWithError[S, F1, (T, U)] = + join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState)) + + // TODO: deep joins could blow the stack, not yet using trampoline here + def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit + sgf: Semigroup[F1], + sgs: Semigroup[S] + ): StateWithError[S, F1, (T, U)] = + StateFn { (requested: S) => + (run(requested), that.run(requested)) match { + case (Right((s1, r1)), Right((s2, r2))) => + Right((sgs.plus(s1, s2), (r1, r2))) + case (Left(err1), Left(err2)) => + Left(sgf.plus(err1, err2)) // both sides failed: combine the errors + case (Left(err), _) => Left(err) + case (_, Left(err)) => Left(err) + } + } + + def apply(state: S): Either[F, (S, T)] = run(state) + + def run(state: S): Either[F, (S, T)] + + def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] = + FlatMappedState(this, next) + + def map[U](fn: (T) => U): StateWithError[S, F, U] = + FlatMappedState(this, (t: T) => StateWithError.const(fn(t))) +} + +/** Simple wrapper of a function in the Monad */ +final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] { + override def run(state: S): Either[F, (S, T)] = fn(state) +} + +/** + * A trampolining instance that should prevent stack overflow at the expense of performance + */ +final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U]) + extends StateWithError[S, F, U] { + override def run(state: S): Either[F, (S, U)] = { + @annotation.tailrec + def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any = + st match { + case StateFn(fn) => + fn(inState) match { + case err @ Left(_) => err // bail at first error + case noError @ Right((newState, out)) => + stack match { + case head :: tailStack => loop(newState, head(out), tailStack) + case Nil => noError // recursion ends + } + } + case FlatMappedState(st, next) => loop(inState, st, next :: stack) + } + loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]] + } +} + +object StateWithError { + def getState[S]: StateWithError[S,
Nothing, S] = + StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Cuber.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Cuber.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/GroupMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/GroupMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/MonoidMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/MonoidMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/RingMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/RingMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/Roller.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/Roller.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala 
b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/SemigroupMacro.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/SemigroupMacro.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/caseclass.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/caseclass.scala diff --git a/algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala b/algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala similarity index 100% rename from algebird-core/src/main/scala/com/twitter/algebird/macros/package.scala rename to algebird-core/src/main/scala-2/com/twitter/algebird/macros/package.scala diff --git a/algebird-core/src/main/scala-3/Aggregator.scala b/algebird-core/src/main/scala-3/Aggregator.scala new file mode 100644 index 000000000..8a4d2b230 --- /dev/null +++ b/algebird-core/src/main/scala-3/Aggregator.scala @@ -0,0 +1,637 @@ +package com.twitter.algebird + +import java.util.PriorityQueue +import scala.collection.compat._ +import scala.collection.generic.CanBuildFrom + +/** + * Aggregators compose well. + * + * To create a parallel aggregator that operates on a single input in parallel, use: + * GeneratedTupleAggregator.from2((agg1, agg2)) + */ +object Aggregator extends java.io.Serializable { + implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] = + new AggregatorApplicative[I] + + private val DefaultSeed = 471312384 + + /** + * This is a trivial aggregator that always returns a single value + */ + def const[T](t: T): MonoidAggregator[Any, Unit, T] = + prepareMonoid { (_: Any) => () }.andThenPresent(_ => t) + + /** + * Using Aggregator.prepare,present you can add to this aggregator + */ + def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = + fromSemigroup(Semigroup.from(red)) + def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = + new Aggregator[T, T, T] { + override def prepare(input: T): T = input + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = + prepareMonoid(identity[T]) + // Uses the product from the ring + def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = + fromRing[T, T](rng, identity[T]) + + def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] = + prepareMonoid(prep)(mon) + + def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = + new Aggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def semigroup: Semigroup[T] = sg + override def present(reduction: T): T = reduction + } + def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + new MonoidAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def monoid: Monoid[T] = m + override def present(reduction: T): T = reduction + } + // Uses the product from the ring + def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = + new RingAggregator[F, T, T] { + override def prepare(input: F): T = prep(input) + override def ring: Ring[T] = rng + override def 
present(reduction: T): T = reduction + } + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation. Equivalent to + * {{{appendSemigroup(prep, appnd, identity[T]_)(sg)}}} + */ + def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, T] = + appendSemigroup(prep, appnd, identity[T])(sg) + + /** + * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Semigroup]] type + * @tparam P + * Presentation (output) type + * @param prep + * The preparation function. Expected to construct an instance of type T from a single data element. + * @param appnd + * Function that appends the [[Semigroup]]. Defines the [[Aggregator.append]] method for this aggregator. + * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param sg + * The [[Semigroup]] type class + * @note + * The functions 'appnd' and 'prep' are expected to obey the law: {{{appnd(t, f) == sg.plus(t, prep(f))}}} + */ + def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit + sg: Semigroup[T] + ): Aggregator[F, T, P] = + new Aggregator[F, T, P] { + override def semigroup: Semigroup[T] = sg + override def prepare(input: F): T = prep(input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = + applyOption(inputs).get + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + agg(inputs).map(pres) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + if (items.iterator.isEmpty) old else reduce(old, agg(items).get) + + private def agg(inputs: TraversableOnce[F]): Option[T] = + if (inputs.iterator.isEmpty) None + else { + val itr = inputs.iterator + val t = prepare(itr.next()) + Some(itr.foldLeft(t)(appnd)) + } + } + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation. Equivalent + * to {{{appendMonoid(appnd, identity[T]_)(m)}}} + */ + def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = + appendMonoid(appnd, identity[T])(m) + + /** + * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation + * @tparam F + * Data input type + * @tparam T + * Aggregating [[Monoid]] type + * @tparam P + * Presentation (output) type + * @param appnd + * Function that appends the [[Monoid]]. Defines the [[MonoidAggregator.append]] method for this + * aggregator.
Analogous to the 'seqop' function in Scala's sequence 'aggregate' method + * @param pres + * The presentation function + * @param m + * The [[Monoid]] type class + * @note + * The function 'appnd' is expected to obey the law: {{{appnd(t, f) == m.plus(t, appnd(m.zero, f))}}} + */ + def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit + m: Monoid[T] + ): MonoidAggregator[F, T, P] = + new MonoidAggregator[F, T, P] { + override def monoid: Monoid[T] = m + override def prepare(input: F): T = appnd(m.zero, input) + override def present(reduction: T): P = pres(reduction) + + override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs)) + + override def applyOption(inputs: TraversableOnce[F]): Option[P] = + if (inputs.iterator.isEmpty) None else Some(apply(inputs)) + + override def append(l: T, r: F): T = appnd(l, r) + + override def appendAll(old: T, items: TraversableOnce[F]): T = + reduce(old, agg(items)) + + override def appendAll(items: TraversableOnce[F]): T = agg(items) + + private def agg(inputs: TraversableOnce[F]): T = + inputs.foldLeft(m.zero)(append) + } + + /** + * How many items satisfy a predicate + */ + def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] = + prepareMonoid { (t: T) => if (pred(t)) 1L else 0L } + + /** + * Do any items satisfy some predicate + */ + def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(OrVal.unboxedMonoid) + + /** + * Do all items satisfy a predicate + */ + def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] = + prepareMonoid(pred)(AndVal.unboxedMonoid) + + /** + * Take the first (left most in reduce order) item found + */ + def head[T]: Aggregator[T, T, T] = fromReduce[T]((l, _) => l) + + /** + * Take the last (right most in reduce order) item found + */ + def last[T]: Aggregator[T, T, T] = fromReduce[T]((_, r) => r) + + /** + * Get the maximum item + */ + def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T] + def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + max[U] + } + + /** + * Get the minimum item + */ + def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T] + def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = { + implicit val ordU: Ordering[U] = Ordering.by(fn) + min[U] + } + + /** + * This returns the number of items we find + */ + def size: MonoidAggregator[Any, Long, Long] = + prepareMonoid((_: Any) => 1L) + + /** + * Take the smallest `count` items using a heap + */ + def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count) + + /** + * Same as sortedTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).take(count). + */ + def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedTake(count)(Ordering.by(fn)) + + /** + * Take the largest `count` items using a heap + */ + def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Same as sortedReverseTake, but using a function that returns a value that has an Ordering. + * + * This function is like writing list.sortBy(fn).reverse.take(count).
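+ * + * A minimal sketch of that equivalence (illustrative; ties may be resolved differently): + * {{{ + * val longest3 = Aggregator.sortByReverseTake(3)((s: String) => s.length) + * // longest3(words) agrees with words.sortBy(_.length).reverse.take(3), up to ties + * }}}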
+ */ + def sortByReverseTake[T, U: Ordering]( + count: Int + )(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] = + Aggregator.sortedReverseTake(count)(Ordering.by(fn)) + + /** + * Immutable version of sortedTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count) + + /** + * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions. + */ + def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] = + new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse) + + /** + * Randomly selects input items where each item has an independent probability 'prob' of being selected. + * This assumes that all sampled records can fit in memory, so use this only when the expected number of + * sampled values is small. + */ + def randomSample[T]( + prob: Double, + seed: Int = DefaultSeed + ): MonoidAggregator[T, Option[Batched[T]], List[T]] = { + assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]") + val rng = new java.util.Random(seed) + Preparer[T] + .filter(_ => rng.nextDouble() <= prob) + .monoidAggregate(toList) + } + + /** + * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer than + * 'count' total records). This assumes that all 'count' of the records can fit in memory, so use this only + * for small values of 'count'. + */ + def reservoirSample[T]( + count: Int, + seed: Int = DefaultSeed + ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = { + val rng = new java.util.Random(seed) + Preparer[T] + .map(rng.nextDouble() -> _) + .monoidAggregate(sortByTake(count)(_._1)) + .andThenPresent(_.map(_._2)) + } + + /** + * Put everything in a List. Note, this could fill the memory if the List is very large. + */ + def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] = + new MonoidAggregator[T, Option[Batched[T]], List[T]] { + override def prepare(t: T): Option[Batched[T]] = Some(Batched(t)) + override def monoid: Monoid[Option[Batched[T]]] = + Monoid.optionMonoid(Batched.semigroup) + override def present(o: Option[Batched[T]]): List[T] = + o.map(_.toList).getOrElse(Nil) + } + + /** + * Put everything in a Set. Note, this could fill the memory if the Set is very large. + */ + def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] = + prepareMonoid { (t: T) => Set(t) } + + /** + * This builds an in-memory Set, and then finally gets the size of that set. This may not be scalable if the + * Uniques are very large. You might check the approximateUniqueCount or HyperLogLog Aggregator to get an + * approximate version of this that is scalable. + */ + def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] = + toSet[T].andThenPresent(_.size) + + /** + * Using a constant amount of memory, give an approximate unique count (~ 1% error). This uses an exact set + * for up to 100 items, then HyperLogLog (HLL) with a 1.2% standard error which uses at most 8192 bytes for + * each HLL. For more control, see HyperLogLogAggregator. + */ + def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] = + SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100) + + /** + * Returns the lower bound of a given percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative.
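+ * + * A usage sketch (illustrative values; the result is a lower bound on the 50th percentile): + * {{{ + * val p50 = Aggregator.approximatePercentile[Long](0.5) + * val medianLowerBound = p50(Seq(12L, 3L, 7L, 25L)) + * }}}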
+ */ + def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregatorLowerBound[T] = + QTreeAggregatorLowerBound[T](percentile, k) + + /** + * Returns the intersection of a bounded percentile where the percentile is between (0,1]. The items that are + * iterated over cannot be negative. + */ + def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit + num: Numeric[T] + ): QTreeAggregator[T] = + QTreeAggregator[T](percentile, k) + + /** + * An aggregator that sums Numeric values into Doubles. + * + * This is really no more than converting to Double and then summing. The conversion to double means we + * don't have the overflow semantics of integer types on the JVM (e.g. Int.MaxValue + 1 == Int.MinValue). + * + * Note that if you instead wanted to aggregate Numeric values of a type T into the same type T (e.g. if you + * want MonoidAggregator[T, T, T] for some Numeric type T), you can directly use Aggregator.fromMonoid[T] + * after importing the numericRing implicit: + * + * {{{ + * import com.twitter.algebird.Ring.numericRing + * def numericAggregator[T: Numeric]: MonoidAggregator[T, T, T] = Aggregator.fromMonoid[T] + * }}} + */ + def numericSum[T](implicit num: Numeric[T]): MonoidAggregator[T, Double, Double] = + Preparer[T].map(num.toDouble).monoidAggregate(Aggregator.fromMonoid) + +} + +/** + * This is a type that models map/reduce(map). First each item is mapped, then we reduce with a semigroup, + * then finally we present the results. + * + * Unlike Fold, Aggregator keeps its middle aggregation type externally visible. This is because Aggregators + * are useful in parallel map/reduce systems where there may be some additional types needed to cross the + * map/reduce boundary (such as serialization and intermediate storage). If you don't care about the middle + * type, an _ may be used and the main utility of the instance is still preserved (e.g. def operate[T](ag: + * Aggregator[T, _, Int]): Int) + * + * Note, join is very useful to combine multiple aggregations with one pass. Also + * GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these together well. + * + * This type is the Fold.M from Haskell's folds package: + * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html + */ +trait Aggregator[-A, B, +C] extends java.io.Serializable { self => + def prepare(input: A): B + def semigroup: Semigroup[B] + def present(reduction: B): C + + /* ***** + * All the following are in terms of the above + */ + + /** + * combine two inner values + */ + def reduce(l: B, r: B): B = semigroup.plus(l, r) + + /** + * This may error if items is empty. To be safe you might use reduceOption if you don't know that items is + * non-empty + */ + def reduce(items: TraversableOnce[B]): B = semigroup.sumOption(items).get + + /** + * This is the safe version of the above.
If the input is empty, return None; else reduce the items + */ + def reduceOption(items: TraversableOnce[B]): Option[B] = + semigroup.sumOption(items) + + /** + * This may error if inputs are empty (for MonoidAggregators it never will; instead you see + * present(Monoid.zero[B])) + */ + def apply(inputs: TraversableOnce[A]): C = + present(reduce(inputs.iterator.map(prepare))) + + /** + * This returns None if the inputs are empty + */ + def applyOption(inputs: TraversableOnce[A]): Option[C] = + reduceOption(inputs.iterator.map(prepare)) + .map(present) + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = + inputs + .scanLeft(None: Option[B]) { + case (None, a) => Some(prepare(a)) + case (Some(b), a) => Some(append(b, a)) + } + .collect { case Some(b) => present(b) } + + /** + * This returns the cumulative sum of its inputs, in the same order. If the inputs are empty, the result + * will be empty too. + */ + def applyCumulatively[In <: TraversableOnce[A], Out]( + inputs: In + )(implicit bf: CanBuildFrom[In, C, Out]): Out = + (bf: BuildFrom[In, C, Out]).fromSpecific(inputs)(cumulativeIterator(inputs.iterator)) + + def append(l: B, r: A): B = reduce(l, prepare(r)) + + def appendAll(old: B, items: TraversableOnce[A]): B = + if (items.iterator.isEmpty) old else reduce(old, reduce(items.iterator.map(prepare))) + + /** Like calling andThen on the present function */ + def andThenPresent[D](present2: C => D): Aggregator[A, B, D] = + new Aggregator[A, B, D] { + override def prepare(input: A): B = self.prepare(input) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): D = present2(self.present(reduction)) + } + + /** Like calling compose on the prepare function */ + def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] = + new Aggregator[A1, B, C] { + override def prepare(input: A1): B = self.prepare(prepare2(input)) + override def semigroup: Semigroup[B] = self.semigroup + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to run two aggregators on the same data with a single pass + */ + def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] = + GeneratedTupleAggregator.from2((this, that)) + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators.
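+ * + * A small sketch (illustrative): + * {{{ + * val sums = Aggregator.fromMonoid[Int] + * val lasts = Aggregator.last[String] + * val both = sums.zip(lasts) + * // both(List((1, "a"), (2, "b"))) == (3, "b") + * }}}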
+ * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = this + new Aggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val semigroup = new Tuple2Semigroup()(ag1.semigroup, ag2.semigroup) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } + + /** + * An Aggregator can be converted to a Fold, but not vice-versa. Note, a Fold is more constrained, so only do + * this if you require joining a Fold with an Aggregator to produce a Fold + */ + def toFold: Fold[A, Option[C]] = + Fold.fold[Option[B], A, Option[C]]( + { + case (None, a) => Some(self.prepare(a)) + case (Some(b), a) => Some(self.append(b, a)) + }, + None, + _.map(self.present) + ) + + def lift: MonoidAggregator[A, Option[B], Option[C]] = + new MonoidAggregator[A, Option[B], Option[C]] { + override def prepare(input: A): Option[B] = Some(self.prepare(input)) + override def present(reduction: Option[B]): Option[C] = reduction.map(self.present) + override def monoid = new OptionMonoid[B]()(self.semigroup) + } +} + +/** + * Aggregators are Applicatives, but this hides the middle type. If you need a join that does not hide the + * middle type use join on the trait, or GeneratedTupleAggregator.fromN + */ +class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, ?, O] })#L] { + override def map[T, U](mt: Aggregator[I, ?, T])(fn: T => U): Aggregator[I, ?, U] = + mt.andThenPresent(fn) + override def apply[T](v: T): Aggregator[I, ?, T] = + Aggregator.const(v) + override def join[T, U](mt: Aggregator[I, ?, T], mu: Aggregator[I, ?, U]): Aggregator[I, ?, (T, U)] = + mt.join(mu) + override def join[T1, T2, T3]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3] + ): Aggregator[I, ?, (T1, T2, T3)] = + GeneratedTupleAggregator.from3((m1, m2, m3)) + + override def join[T1, T2, T3, T4]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4] + ): Aggregator[I, ?, (T1, T2, T3, T4)] = + GeneratedTupleAggregator.from4((m1, m2, m3, m4)) + + override def join[T1, T2, T3, T4, T5]( + m1: Aggregator[I, ?, T1], + m2: Aggregator[I, ?, T2], + m3: Aggregator[I, ?, T3], + m4: Aggregator[I, ?, T4], + m5: Aggregator[I, ?, T5] + ): Aggregator[I, ?, (T1, T2, T3, T4, T5)] = + GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5)) +} + +trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self => + def monoid: Monoid[B] + override def semigroup: Monoid[B] = monoid + final override def reduce(items: TraversableOnce[B]): B = + monoid.sum(items) + + def appendAll(items: TraversableOnce[A]): B = reduce(items.iterator.map(prepare)) + + override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] = { + val self = this + new MonoidAggregator[A, B, D] { + override def prepare(a: A): B = self.prepare(a) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): D = present2(self.present(b)) + } + } + override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] = { + val self = this + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = self.prepare(prepare2(a)) + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + } + + /** + * Build a MonoidAggregator that either takes left or right input and
outputs the pair from both + */ + def either[A2, B2, C2]( + that: MonoidAggregator[A2, B2, C2] + ): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] = + new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] { + override def prepare(e: Either[A, A2]): (B, B2) = e match { + case Left(a) => (self.prepare(a), that.monoid.zero) + case Right(a2) => (self.monoid.zero, that.prepare(a2)) + } + override val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid) + override def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2)) + } + + /** + * Only transform values where the function is defined, else discard + */ + def collectBefore[A2](fn: PartialFunction[A2, A]): MonoidAggregator[A2, B, C] = + new MonoidAggregator[A2, B, C] { + override def prepare(a: A2): B = + if (fn.isDefinedAt(a)) self.prepare(fn(a)) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * Only aggregate items that match a predicate + */ + def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] = + new MonoidAggregator[A1, B, C] { + override def prepare(a: A1): B = if (pred(a)) self.prepare(a) else self.monoid.zero + override def monoid: Monoid[B] = self.monoid + override def present(b: B): C = self.present(b) + } + + /** + * This maps the inputs to Bs, then sums them, effectively flattening the inputs to the MonoidAggregator + */ + def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] = + new MonoidAggregator[TraversableOnce[A], B, C] { + override def monoid: Monoid[B] = self.monoid + override def prepare(input: TraversableOnce[A]): B = + monoid.sum(input.iterator.map(self.prepare)) + override def present(reduction: B): C = self.present(reduction) + } + + /** + * This allows you to join two aggregators into one that takes a tuple input, which in turn allows you to + * chain .composePrepare onto the result if you have an initial input that has to be prepared differently + * for each of the joined aggregators. + * + * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs)) + */ + def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] = { + val ag1 = self + new MonoidAggregator[(A, A2), (B, B2), (C, C2)] { + override def prepare(a: (A, A2)): (B, B2) = (ag1.prepare(a._1), ag2.prepare(a._2)) + override val monoid = new Tuple2Monoid[B, B2]()(ag1.monoid, ag2.monoid) + override def present(b: (B, B2)): (C, C2) = (ag1.present(b._1), ag2.present(b._2)) + } + } +} + +trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] { + def ring: Ring[B] + override def monoid: Monoid[B] = Ring.asTimesMonoid(ring) +} diff --git a/algebird-core/src/main/scala-3/CountMinSketch.scala b/algebird-core/src/main/scala-3/CountMinSketch.scala new file mode 100644 index 000000000..a526b2a51 --- /dev/null +++ b/algebird-core/src/main/scala-3/CountMinSketch.scala @@ -0,0 +1,1418 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.algebird + +import algebra.CommutativeMonoid + +import scala.collection.compat._ + +/** + * A Count-Min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear + * space. + * + * It works as follows. Let `(eps, delta)` be two parameters that describe the confidence in our error + * estimates, and let `d = ceil(ln 1/delta)` and `w = ceil(e / eps)`. + * + * Note: Throughout the code `d` and `w` are called `depth` and `width`, respectively. + * + * Then: + * + * - Take `d` pairwise independent hash functions `h_i`, each of which maps onto the domain `[0, w - 1]`. + * - Create a 2-dimensional table of counts, with `d` rows and `w` columns, initialized with all zeroes. + * - When a new element x arrives in the stream, update the table of counts by setting `counts[i, h_i[x]] += + * 1`, for each `1 <= i <= d`. + * - (Note the rough similarity to a Bloom filter.) + * + * As an example application, suppose you want to estimate the number of times an element `x` has appeared in + * a data stream so far. The Count-Min sketch estimate of this frequency is + * + * min_i { counts[i, h_i[x]] } + * + * With probability at least `1 - delta`, this estimate is within `eps * N` of the true frequency (i.e., `true + * frequency <= estimate <= true frequency + eps * N`), where N is the total size of the stream so far. + * + * See http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf for technical details, including proofs of the + * estimates and error bounds used in this implementation. + * + * Parts of this implementation are taken from + * https://github.com/clearspring/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java + * + * @author + * Edwin Chen + */ +/** + * Monoid for adding CMS sketches. + * + * =Usage= + * + * `eps` and `delta` are parameters that bound the error of each query estimate. For example, errors in + * answering point queries (e.g., how often has element x appeared in the stream described by the sketch?) are + * often of the form: "with probability p >= 1 - delta, the estimate is close to the truth by some factor + * depending on eps." + * + * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`, + * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`. + * + * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation" + * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use + * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the + * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the + * existing CMSHasher implementations as a starting point. + * + * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot safely + * use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to convert + * `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird provides one + * such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate.
+ * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param seed + * A seed to initialize the random number generator used to create the pairwise independent hash functions. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. For example, if you want to count the occurrence of + * user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the + * occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of + * your problem domain and their identifiers used for counting via CMS should be bijective. We require a + * [[CMSHasher]] context bound for `K`, see [[CMSHasherImplicits]] for available implicits that can be + * imported. Which type K should you pick in practice? For domains that have fewer than `2^64` unique + * elements, you'd typically use `Long`. For larger domains you can try `BigInt`, for example. Other + * possibilities include Spire's `SafeLong` and `Numerical` data types (https://github.com/non/spire), + * though Algebird does not include the required implicits for CMS-hashing (cf. [[CMSHasherImplicits]]). + */ +class CMSMonoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, maxExactCountOpt: Option[Int] = None) + extends Monoid[CMS[K]] + with CommutativeMonoid[CMS[K]] { + + val params: CMSParams[K] = { + val hashes: Seq[CMSHash[K]] = CMSFunctions.generateHashes(eps, delta, seed) + CMSParams(hashes, eps, delta, maxExactCountOpt) + } + + override val zero: CMS[K] = CMSZero[K](params) + + /** + * Combines the two sketches. + * + * The sketches must use the same hash functions. + */ + override def plus(left: CMS[K], right: CMS[K]): CMS[K] = { + require(left.params.hashes == right.params.hashes, "The sketches must use the same hash functions.") + left ++ right + } + + /** + * Creates a sketch out of a single item. + */ + def create(item: K): CMS[K] = CMSItem[K](item, 1L, params) + + /** + * Creates a sketch out of multiple items. + */ + def create(data: Seq[K]): CMS[K] = { + val summation = new CMSSummation(params) + data.foreach(k => summation.insert(k, 1L)) + summation.result + } + + override def sumOption(sketches: TraversableOnce[CMS[K]]): Option[CMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) + + override def sum(sketches: TraversableOnce[CMS[K]]): CMS[K] = { + val summation = new CMSSummation(params) + summation.updateAll(sketches) + summation.result + } +} + +/** + * This mutable builder can be used when speed is essential and you can be sure the scope of the mutability + * cannot escape in an unsafe way. The intended use is to allocate and call result in one method without + * letting a reference to the instance escape into a closure.
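+ * + * A sketch of that pattern (a hypothetical helper, assuming a CMSParams[K] is in scope): + * {{{ + * def sumAll[K](params: CMSParams[K], items: Seq[K]): CMS[K] = { + * val summation = new CMSSummation(params) + * items.foreach(k => summation.insert(k, 1L)) + * summation.result // the mutable builder never escapes this method + * } + * }}}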
+ */ +class CMSSummation[K](params: CMSParams[K]) { + private[this] val hashes = params.hashes.toArray + private[this] val height = CMSFunctions.depth(params.delta) + private[this] val width = CMSFunctions.width(params.eps) + private[this] val cells = new Array[Long](height * width) + private[this] var totalCount = 0L + + final def insert(k: K, count: Long): Unit = { + var row = 0 + var offset = 0 + val hs = hashes + while (row < hs.length) { + cells(offset + hs(row)(k)) += count + offset += width + row += 1 + } + totalCount += count + } + + def updateAll(sketches: TraversableOnce[CMS[K]]): Unit = + sketches.iterator.foreach(updateInto) + + def updateInto(cms: CMS[K]): Unit = + cms match { + case CMSZero(_) => + () + case CMSItem(item, count, _) => + insert(item, count) + case SparseCMS(table, _, _) => + table.foreach { case (item, c) => + insert(item, c) + } + case CMSInstance(CMSInstance.CountsTable(matrix), count, _) => + var offset = 0 + val rit = matrix.iterator + while (rit.hasNext) { + var col = 0 + val cit = rit.next().iterator + while (cit.hasNext) { + cells(offset + col) += cit.next() + col += 1 + } + offset += width + } + totalCount += count + } + + def result: CMS[K] = + if (totalCount == 0L) CMSZero(params) + else { + def vectorize(row: Int): Vector[Long] = { + val offset = row * width + val b = Vector.newBuilder[Long] + var col = 0 + while (col < width) { + b += cells(offset + col) + col += 1 + } + b.result() + } + + val b = Vector.newBuilder[Vector[Long]] + var row = 0 + while (row < height) { + b += vectorize(row) + row += 1 + } + CMSInstance(CMSInstance.CountsTable(b.result()), totalCount, params) + } +} + +/** + * An Aggregator for [[CMS]]. Can be created using CMS.aggregator. + */ +case class CMSAggregator[K](cmsMonoid: CMSMonoid[K]) extends MonoidAggregator[K, CMS[K], CMS[K]] { + override val monoid: CMSMonoid[K] = cmsMonoid + + override def prepare(value: K): CMS[K] = monoid.create(value) + + override def present(cms: CMS[K]): CMS[K] = cms + +} + +/** + * Configuration parameters for [[CMS]]. + * + * @param hashes + * Pair-wise independent hash functions. We need `N=depth` such functions (`depth` can be derived from + * `delta`). + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * A bound on the probability that a query estimate does not lie within some small interval (an interval + * that depends on `eps`) around the truth. + * @param maxExactCountOpt + * An Option parameter about how many exact counts a sparse CMS wants to keep. + * @tparam K + * The type used to identify the elements to be counted. + */ +case class CMSParams[K]( + hashes: Seq[CMSHash[K]], + eps: Double, + delta: Double, + maxExactCountOpt: Option[Int] = None +) { + + require(0 < eps && eps < 1, "eps must lie in (0, 1)") + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + require( + hashes.size >= CMSFunctions.depth(delta), + s"we require at least ${CMSFunctions.depth(delta)} hash functions" + ) + +} + +/** + * Helper functions to generate or to translate between various CMS parameters (cf. [[CMSParams]]). + */ +object CMSFunctions { + + /** + * Translates from `width` to `eps`. + */ + def eps(width: Int): Double = scala.math.exp(1.0) / width + + /** + * Translates from `depth` to `delta`.
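+ * + * Concretely, per the implementation below: {{{ delta(depth) == math.exp(-depth) }}}, so e.g. delta(5) is roughly 0.0067.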
+ */ + @throws[IllegalArgumentException]("if depth is too large, causing precision errors when computing delta") + def delta(depth: Int): Double = { + val i = scala.math.exp(-depth) + require( + i > 0.0, + s"depth must be smaller as it causes precision errors when computing delta ($depth led to an invalid delta of $i)" + ) + i + } + + /** + * Translates from `delta` to `depth`. + */ + @throws[IllegalArgumentException]("if delta is not in (0, 1)") + def depth(delta: Double): Int = { + require(0 < delta && delta < 1, "delta must lie in (0, 1)") + scala.math.ceil(scala.math.log(1.0 / delta)).toInt + } + + /** + * Translates from `eps` to `width`. + */ + def width(eps: Double): Int = + scala.math.ceil(truncatePrecisionError(scala.math.exp(1) / eps)).toInt + + /** + * Compute maxExactCount from parameters or `depth` and `width` + */ + def maxExactCount(maxExactCountOpt: Option[Int], depth: Int, width: Int): Int = + maxExactCountOpt.getOrElse(math.max(width * depth / 100, 50)) + + // Eliminates precision errors such as the following: + // + // scala> val width = 39 + // scala> scala.math.exp(1) / CMSFunctions.eps(width) + // res171: Double = 39.00000000000001 <<< should be 39.0 + // + // Because of the actual types on which CMSFunctions operates (i.e. Int and Double), the maximum number of decimal + // places should be 6. + private def truncatePrecisionError(i: Double, decimalPlaces: Int = 6) = + BigDecimal(i) + .setScale(decimalPlaces, BigDecimal.RoundingMode.HALF_UP) + .toDouble + + /** + * Generates `N=depth` pair-wise independent hash functions. + * + * @param eps + * One-sided error bound on the error of each point query, i.e. frequency estimate. + * @param delta + * Error bound on the probability that a query estimate does NOT lie within some small interval around the + * truth. + * @param seed + * Seed for the random number generator. + * @tparam K + * The type used to identify the elements to be counted. + * @return + * The generated hash functions. + */ + def generateHashes[K: CMSHasher](eps: Double, delta: Double, seed: Int): Seq[CMSHash[K]] = { + // Typically, we would use d -- aka depth -- pair-wise independent hash functions of the form + // + // h_i(x) = a_i * x + b_i (mod p) + // + // But for this particular application, setting b_i does not matter (since all it does is shift the results of a + // particular hash), so we omit it (by setting b_i to 0) and simply use hash functions of the form + // + // h_i(x) = a_i * x (mod p) + // + val r = new scala.util.Random(seed) + val numHashes = depth(delta) + val numCounters = width(eps) + (0 to (numHashes - 1)).map(_ => CMSHash[K](r.nextInt(), 0, numCounters)) + } + +} + +/** + * A trait for CMS implementations that can count elements in a data stream and that can answer point queries + * (i.e. frequency estimates) for these elements. + * + * Known implementations: [[CMS]], [[TopCMS]]. + * + * @tparam K + * The type used to identify the elements to be counted. + * @tparam C + * The type of the actual CMS that implements this trait. + */ +trait CMSCounting[K, C[_]] { + + /** + * Returns the one-sided error bound on the error of each point query, i.e. frequency estimate. + */ + def eps: Double + + /** + * Returns the bound on the probability that a query estimate does NOT lie within some small interval (an + * interval that depends on `eps`) around the truth. + */ + def delta: Double + + /** + * Number of hash functions (also: number of rows in the counting table). This number is derived from + * `delta`.
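+ * + * Concretely (see [[CMSFunctions.depth]]): {{{ depth == math.ceil(math.log(1.0 / delta)).toInt }}}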
+   */
+  def depth: Int = CMSFunctions.depth(delta)
+
+  /**
+   * Number of counters per hash function (also: number of columns in the counting table). This number is
+   * derived from `eps`.
+   */
+  def width: Int = CMSFunctions.width(eps)
+
+  /**
+   * An optional parameter for how many exact counts a sparse CMS keeps.
+   */
+  def maxExactCountOpt: Option[Int]
+
+  /**
+   * Number of exact counts a sparse CMS wants to keep. This number is derived from `maxExactCountOpt`.
+   */
+  def maxExactCount: Int =
+    CMSFunctions.maxExactCount(maxExactCountOpt, depth, width)
+
+  /**
+   * Returns a new sketch that is the combination of this sketch and the other sketch.
+   */
+  def ++(other: C[K]): C[K]
+
+  /**
+   * Counts the item and returns the result as a new sketch.
+   */
+  def +(item: K): C[K] = this + (item, 1L)
+
+  /**
+   * Counts the item `count` times and returns the result as a new sketch.
+   */
+  def +(item: K, count: Long): C[K]
+
+  /**
+   * Returns an estimate of the total number of times this item has been seen in the stream so far. This
+   * estimate is an upper bound.
+   *
+   * It is always true that `estimatedFrequency >= trueFrequency`. With probability `p >= 1 - delta`, it also
+   * holds that `estimatedFrequency <= trueFrequency + eps * totalCount`.
+   */
+  def frequency(item: K): Approximate[Long]
+
+  /**
+   * Returns an estimate of the inner product against another data stream.
+   *
+   * In other words, let a_i denote the number of times element i has been seen in the data stream summarized
+   * by this CMS, and let b_i denote the same for the other CMS. Then this returns an estimate of
+   * `<a, b> = \sum a_i b_i`.
+   *
+   * Note: This can also be viewed as the join size between two relations.
+   *
+   * It is always true that `actualInnerProduct <= estimatedInnerProduct`. With probability `p >= 1 - delta`,
+   * it also holds that `estimatedInnerProduct <= actualInnerProduct + eps * thisTotalCount * otherTotalCount`.
+   */
+  def innerProduct(other: C[K]): Approximate[Long]
+
+  /**
+   * Total number of elements counted (i.e. seen in the data stream) so far.
+   */
+  def totalCount: Long
+
+  /**
+   * The first frequency moment is the total number of elements in the stream.
+   */
+  def f1: Long = totalCount
+
+  /**
+   * The second frequency moment is `\sum a_i^2`, where `a_i` is the count of the i-th element.
+   */
+  def f2: Approximate[Long]
+
+}
+
+/**
+ * A trait for CMS implementations that can track heavy hitters in a data stream.
+ *
+ * It is up to the implementation how the semantics of tracking heavy hitters are defined. For instance, one
+ * implementation could track the "top %" heavy hitters whereas another implementation could track the "top N"
+ * heavy hitters.
+ *
+ * Known implementations: [[TopCMS]].
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+trait CMSHeavyHitters[K] {
+
+  /**
+   * The pluggable logic of how heavy hitters are being tracked.
+   */
+  def heavyHittersLogic: HeavyHittersLogic[K]
+
+  /**
+   * Returns the set of heavy hitters.
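+   *
+   * For example (hypothetical): with [[TopNLogic]] and `heavyHittersN = 3`, this set contains at most three
+   * items.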
+   */
+  def heavyHitters: Set[K]
+
+}
+
+object CMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSMonoid[K] =
+    monoid(eps, delta, seed, None)
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSMonoid[K] =
+    new CMSMonoid[K](eps, delta, seed, maxExactCountOpt)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSMonoid[K] =
+    monoid(depth, width, seed, None)
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, maxExactCountOpt: Option[Int]): CMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+  def aggregator[K: CMSHasher](eps: Double, delta: Double, seed: Int): CMSAggregator[K] =
+    aggregator(eps, delta, seed, None)
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSAggregator[K] =
+    new CMSAggregator[K](monoid(eps, delta, seed, maxExactCountOpt))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int): CMSAggregator[K] =
+    aggregator(depth, width, seed, None)
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      maxExactCountOpt: Option[Int]
+  ): CMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, maxExactCountOpt)
+
+  /**
+   * Returns a fresh, zeroed CMS instance.
+   */
+  def apply[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      maxExactCountOpt: Option[Int] = None
+  ): CMS[K] = {
+    val params = {
+      val hashes: Seq[CMSHash[K]] =
+        CMSFunctions.generateHashes(eps, delta, seed)
+      CMSParams(hashes, eps, delta, maxExactCountOpt)
+    }
+    CMSZero[K](params)
+  }
+
+}
+
+/**
+ * A Count-Min sketch data structure that allows for counting and frequency estimation of elements in a data
+ * stream.
+ *
+ * Tip: If you also need to track heavy hitters ("Top N" problems), take a look at [[TopCMS]].
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[CMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val cmsMonoid: CMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     CMS.monoid[Long](eps, delta, seed)
+ *   }
+ *
+ *   // Creates a CMS instance that has counted the element `1L`.
+ *   val cms: CMS[Long] = cmsMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = cms.frequency(1L)
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class CMS[K](val params: CMSParams[K]) extends java.io.Serializable with CMSCounting[K, CMS] {
+
+  override val eps: Double = params.eps
+
+  override val delta: Double = params.delta
+
+  override val maxExactCountOpt: Option[Int] = params.maxExactCountOpt
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+}
+
+/**
+ * Zero element. Used for initialization.
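+ *
+ * For example (hypothetical values), `CMS[Long](eps = 0.001, delta = 1e-10, seed = 1)` returns a [[CMSZero]]
+ * until the first item is counted.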
+ */
+case class CMSZero[K](override val params: CMSParams[K]) extends CMS[K](params) {
+
+  override val totalCount: Long = 0L
+
+  override def +(item: K, count: Long): CMS[K] = CMSItem[K](item, count, params)
+
+  override def ++(other: CMS[K]): CMS[K] = other
+
+  override def frequency(item: K): Approximate[Long] = Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(0L)
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class CMSItem[K](item: K, override val totalCount: Long, override val params: CMSParams[K])
+    extends CMS[K](params) {
+
+  override def +(x: K, count: Long): CMS[K] =
+    SparseCMS[K](params) + (item, totalCount) + (x, count)
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?] => this
+      case other: CMSItem[K] =>
+        CMSInstance[K](params) + (item, totalCount) + (other.item, other.totalCount)
+      case _ => other + item
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    if (item == x) Approximate.exact(totalCount) else Approximate.exact(0L)
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    Approximate.exact(totalCount) * other.frequency(item)
+
+}
+
+/**
+ * A sparse Count-Min sketch structure, used for situations where the key is highly skewed.
+ */
+case class SparseCMS[K](
+    exactCountTable: Map[K, Long],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+  import SparseCMS._
+
+  override def +(x: K, count: Long): CMS[K] = {
+    val currentCount = exactCountTable.getOrElse(x, 0L)
+    val newTable = exactCountTable.updated(x, currentCount + count)
+    if (newTable.size < maxExactCount) {
+      // still sparse
+      SparseCMS(newTable, totalCount = totalCount + count, params = params)
+    } else {
+      toDense(newTable, params)
+    }
+  }
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?]     => this
+      case other: CMSItem[K] => this + (other.item, other.totalCount)
+      case other: SparseCMS[K] =>
+        // This SparseCMS's maxExactCount is used, so ++ is not commutative
+        val newTable = Semigroup.plus(exactCountTable, other.exactCountTable)
+        if (newTable.size < maxExactCount) {
+          // still sparse
+          SparseCMS(newTable, totalCount = totalCount + other.totalCount, params = params)
+        } else {
+          toDense(newTable, params)
+        }
+
+      case other: CMSInstance[K] => other ++ this
+    }
+
+  override def frequency(x: K): Approximate[Long] =
+    Approximate.exact(exactCountTable.getOrElse(x, 0L))
+
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    exactCountTable.iterator
+      .map { case (x, count) => Approximate.exact(count) * other.frequency(x) }
+      .reduceOption(_ + _)
+      .getOrElse(Approximate.exact(0L))
+}
+
+object SparseCMS {
+
+  /**
+   * Creates a new [[SparseCMS]] with an empty exactCountTable.
+   */
+  def apply[K](params: CMSParams[K]): SparseCMS[K] = {
+    val exactCountTable = Map[K, Long]()
+    SparseCMS[K](exactCountTable, totalCount = 0, params = params)
+  }
+
+  /**
+   * Creates a new [[CMSInstance]] from a Map[K, Long].
+   */
+  def toDense[K](exactCountTable: Map[K, Long], params: CMSParams[K]): CMS[K] =
+    // Create a new CMSInstance
+    exactCountTable.foldLeft(CMSInstance[K](params)) { case (cms, (x, count)) =>
+      cms + (x, count)
+    }
+}
+
+/**
+ * The general Count-Min sketch structure, used for holding any number of elements.
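+ *
+ * A minimal sketch of typical use (hypothetical values; a `params: CMSParams[Long]` is assumed in scope):
+ * {{{
+ * val cms = CMSInstance[Long](params) + (1L, 2L) + (2L, 1L)
+ * cms.frequency(1L).estimate // >= 2, and <= 2 + eps * totalCount with probability 1 - delta
+ * }}}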
+ */
+case class CMSInstance[K](
+    countsTable: CMSInstance.CountsTable[K],
+    override val totalCount: Long,
+    override val params: CMSParams[K]
+) extends CMS[K](params) {
+
+  override def ++(other: CMS[K]): CMS[K] =
+    other match {
+      case _: CMSZero[?]     => this
+      case other: CMSItem[K] => this + other.item
+      case other: SparseCMS[K] =>
+        other.exactCountTable.foldLeft(this) { case (cms, (x, count)) =>
+          cms + (x, count)
+        }
+      case other: CMSInstance[K] =>
+        val newTable = countsTable ++ other.countsTable
+        val newTotalCount = totalCount + other.totalCount
+        CMSInstance[K](newTable, newTotalCount, params)
+    }
+
+  private def makeApprox(est: Long): Approximate[Long] =
+    if (est == 0L) Approximate.exact(0L)
+    else {
+      val lower = math.max(0L, est - (eps * totalCount).toLong)
+      Approximate(lower, est, est, 1 - delta)
+    }
+
+  override def frequency(item: K): Approximate[Long] = {
+    var freq = Long.MaxValue
+    val hs = params.hashes
+    val it = countsTable.counts.iterator
+    var i = 0
+    while (it.hasNext) {
+      val row = it.next()
+      val count = row(hs(i)(item))
+      if (count < freq) freq = count
+      i += 1
+    }
+    makeApprox(freq)
+  }
+
+  /**
+   * Let A and B be two CMS instances, and let count_X[j, k] denote the value in X's 2-dimensional count
+   * table at row j and column k. Then the Count-Min sketch estimate of the inner product between A and B is
+   * the minimum inner product between their rows:
+   * estimatedInnerProduct = min_j (\sum_k count_A[j, k] * count_B[j, k])
+   */
+  override def innerProduct(other: CMS[K]): Approximate[Long] =
+    other match {
+      case other: CMSInstance[?] =>
+        require(other.depth == depth && other.width == width, "Tables must have the same dimensions.")
+
+        def innerProductAtDepth(d: Int) =
+          (0 to (width - 1)).iterator.map { w =>
+            countsTable.getCount((d, w)) * other.countsTable.getCount((d, w))
+          }.sum
+
+        val est = (0 to (depth - 1)).iterator.map(innerProductAtDepth).min
+        val minimum =
+          math.max(est - (eps * totalCount * other.totalCount).toLong, 0)
+        Approximate(minimum, est, est, 1 - delta)
+      case _ => other.innerProduct(this)
+    }
+
+  override def +(item: K, count: Long): CMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCountsTable =
+        (0 to (depth - 1)).foldLeft(countsTable) { case (table, row) =>
+          val pos = (row, params.hashes(row)(item))
+          table + (pos, count)
+        }
+      CMSInstance[K](newCountsTable, totalCount + count, params)
+    } else this
+  }
+
+}
+
+object CMSInstance {
+
+  /**
+   * Initializes a [[CMSInstance]] with all zeroes, i.e. nothing has been counted yet.
+   */
+  def apply[K](params: CMSParams[K]): CMSInstance[K] = {
+    val countsTable = CountsTable[K](CMSFunctions.depth(params.delta), CMSFunctions.width(params.eps))
+    CMSInstance[K](countsTable, 0, params)
+  }
+
+  /**
+   * The 2-dimensional table of counters used in the Count-Min sketch. Each row corresponds to a particular
+   * hash function.
+   */
+  // TODO: implement a dense matrix type, and use it here
+  case class CountsTable[K](counts: Vector[Vector[Long]]) {
+    require(depth > 0, "Table must have at least 1 row.")
+    require(width > 0, "Table must have at least 1 column.")
+
+    def depth: Int = counts.size
+
+    def width: Int = counts(0).size
+
+    def getCount(pos: (Int, Int)): Long = {
+      val (row, col) = pos
+      require(row < depth && col < width, "Position must be within the bounds of this table.")
+      counts(row)(col)
+    }
+
+    /**
+     * Updates the count of a single cell in the table.
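+     *
+     * For example (hypothetical): `table + ((0, 3), 5L)` returns a copy of `table` with the cell at row 0,
+     * column 3 incremented by 5.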
+     */
+    def +(pos: (Int, Int), count: Long): CountsTable[K] = {
+      val (row, col) = pos
+      val currCount = getCount(pos)
+      val newCounts =
+        counts.updated(row, counts(row).updated(col, currCount + count))
+      CountsTable[K](newCounts)
+    }
+
+    /**
+     * Adds another counts table to this one, through element-wise addition.
+     */
+    def ++(other: CountsTable[K]): CountsTable[K] = {
+      require(depth == other.depth && width == other.width, "Tables must have the same dimensions.")
+      val xss = this.counts.iterator
+      val yss = other.counts.iterator
+      val rows = Vector.newBuilder[Vector[Long]]
+      while (xss.hasNext) {
+        val xs = xss.next().iterator
+        val ys = yss.next().iterator
+        val row = Vector.newBuilder[Long]
+        while (xs.hasNext) row += (xs.next() + ys.next())
+        rows += row.result()
+      }
+      CountsTable[K](rows.result())
+    }
+  }
+
+  object CountsTable {
+
+    /**
+     * Creates a new [[CountsTable]] with counts initialized to all zeroes.
+     */
+    def apply[K](depth: Int, width: Int): CountsTable[K] =
+      CountsTable[K](Vector.fill[Long](depth, width)(0L))
+
+  }
+
+}
+
+case class TopCMSParams[K](logic: HeavyHittersLogic[K])
+
+/**
+ * A Count-Min sketch data structure that allows for (a) counting and frequency estimation of elements in a
+ * data stream and (b) tracking the heavy hitters among these elements.
+ *
+ * The logic of how heavy hitters are computed is pluggable, see [[HeavyHittersLogic]].
+ *
+ * Tip: If you do not need to track heavy hitters, take a look at [[CMS]], which is more efficient in this
+ * case.
+ *
+ * =Usage=
+ *
+ * This example demonstrates how to count `Long` elements with [[TopCMS]], i.e. `K=Long`.
+ *
+ * Note that the actual counting is always performed with a `Long`, regardless of your choice of `K`. That is,
+ * the counting table behind the scenes is backed by `Long` values (at least in the current implementation),
+ * and thus the returned frequency estimates are always instances of `Approximate[Long]`.
+ *
+ * @example
+ *   {{{
+ *   // Creates a monoid for a CMS that can count `Long` elements.
+ *   val topPctCMSMonoid: TopPctCMSMonoid[Long] = {
+ *     val eps = 0.001
+ *     val delta = 1E-10
+ *     val seed = 1
+ *     val heavyHittersPct = 0.1
+ *     TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
+ *   }
+ *
+ *   // Creates a TopCMS instance that has counted the element `1L`.
+ *   val topCMS: TopCMS[Long] = topPctCMSMonoid.create(1L)
+ *
+ *   // Estimates the frequency of `1L`
+ *   val estimate: Approximate[Long] = topCMS.frequency(1L)
+ *
+ *   // What are the heavy hitters so far?
+ *   val heavyHitters: Set[Long] = topCMS.heavyHitters
+ *   }}}
+ *
+ * @tparam K
+ *   The type used to identify the elements to be counted.
+ */
+sealed abstract class TopCMS[K](val cms: CMS[K], params: TopCMSParams[K])
+    extends java.io.Serializable
+    with CMSCounting[K, TopCMS]
+    with CMSHeavyHitters[K] {
+
+  override val eps: Double = cms.eps
+
+  override val delta: Double = cms.delta
+
+  override val totalCount: Long = cms.totalCount
+
+  override val maxExactCountOpt: Option[Int] = cms.maxExactCountOpt
+
+  override def frequency(item: K): Approximate[Long] = cms.frequency(item)
+
+  override def innerProduct(other: TopCMS[K]): Approximate[Long] =
+    cms.innerProduct(other.cms)
+
+  override def f2: Approximate[Long] = innerProduct(this)
+
+  /**
+   * The pluggable logic with which heavy hitters are being tracked.
+   */
+  override def heavyHittersLogic: HeavyHittersLogic[K] = params.logic
+
+}
+
+/**
+ * Zero element. Used for initialization.
+ */
+case class TopCMSZero[K](override val cms: CMS[K], params: TopCMSParams[K]) extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set.empty[K]
+
+  override def +(item: K, count: Long): TopCMS[K] =
+    TopCMSInstance(cms, params) + (item, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other
+
+}
+
+/**
+ * Used for holding a single element, to avoid repeatedly adding elements from sparse counts tables.
+ */
+case class TopCMSItem[K](item: K, override val cms: CMS[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override val heavyHitters: Set[K] = Set(item)
+
+  override def +(x: K, count: Long): TopCMS[K] = toCMSInstance + (x, count)
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?]         => this
+    case other: TopCMSItem[K]     => toCMSInstance + other.item
+    case other: TopCMSInstance[K] => other + item
+  }
+
+  private def toCMSInstance: TopCMSInstance[K] = {
+    val hhs = HeavyHitters.from(HeavyHitter(item, 1L))
+    TopCMSInstance(cms, hhs, params)
+  }
+
+}
+
+object TopCMSInstance {
+
+  def apply[K](cms: CMS[K], params: TopCMSParams[K]): TopCMSInstance[K] =
+    TopCMSInstance[K](cms, HeavyHitters.empty[K], params)
+
+}
+
+case class TopCMSInstance[K](override val cms: CMS[K], hhs: HeavyHitters[K], params: TopCMSParams[K])
+    extends TopCMS[K](cms, params) {
+
+  override def heavyHitters: Set[K] = hhs.items
+
+  override def +(item: K, count: Long): TopCMSInstance[K] = {
+    require(count >= 0, "count must be >= 0 (negative counts not implemented)")
+    if (count != 0L) {
+      val newCms = cms + (item, count)
+      val newHhs =
+        heavyHittersLogic.updateHeavyHitters(cms, newCms)(hhs, item, count)
+      TopCMSInstance[K](newCms, newHhs, params)
+    } else this
+  }
+
+  override def ++(other: TopCMS[K]): TopCMS[K] = other match {
+    case _: TopCMSZero[?]     => this
+    case other: TopCMSItem[K] => this + other.item
+    case other: TopCMSInstance[K] =>
+      val newCms = cms ++ other.cms
+      val newHhs = heavyHittersLogic.updateHeavyHitters(newCms)(hhs, other.hhs)
+      TopCMSInstance(newCms, newHhs, params)
+  }
+
+}
+
+class TopCMSMonoid[K](emptyCms: CMS[K], logic: HeavyHittersLogic[K]) extends Monoid[TopCMS[K]] {
+
+  val params: TopCMSParams[K] = TopCMSParams(logic)
+
+  override val zero: TopCMS[K] = TopCMSZero[K](emptyCms, params)
+
+  /**
+   * Combines the two sketches.
+   *
+   * The sketches must use the same hash functions.
+   */
+  override def plus(left: TopCMS[K], right: TopCMS[K]): TopCMS[K] = {
+    require(
+      left.cms.params.hashes == right.cms.params.hashes,
+      "The sketches must use the same hash functions."
+    )
+    left ++ right
+  }
+
+  /**
+   * Creates a sketch out of a single item.
+   */
+  def create(item: K): TopCMS[K] =
+    TopCMSItem[K](item, emptyCms + item, params)
+
+  /**
+   * Creates a sketch out of multiple items.
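+   *
+   * For example (hypothetical values, reusing `topPctCMSMonoid` from the [[TopCMS]] example):
+   * {{{
+   * val topCms = topPctCMSMonoid.create(Seq(1L, 1L, 2L)) // three counted items
+   * }}}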
+ */ + def create(data: Seq[K]): TopCMS[K] = + data.foldLeft(zero) { case (acc, x) => plus(acc, create(x)) } + + override def sum(sketches: TraversableOnce[TopCMS[K]]): TopCMS[K] = { + val topCandidates = scala.collection.mutable.Set.empty[K] + val summation = new CMSSummation(emptyCms.params) + sketches.iterator.foreach { sketch => + summation.updateInto(sketch.cms) + topCandidates ++= sketch.heavyHitters + } + val cms = summation.result + val ests = + topCandidates.map(k => HeavyHitter(k, cms.frequency(k).estimate)).toSet + val hhs = logic.purgeHeavyHitters(cms)(HeavyHitters(ests)) + TopCMSInstance(cms, hhs, params) + } + + override def sumOption(sketches: TraversableOnce[TopCMS[K]]): Option[TopCMS[K]] = + if (sketches.iterator.isEmpty) None else Some(sum(sketches)) +} + +class TopCMSAggregator[K](cmsMonoid: TopCMSMonoid[K]) extends MonoidAggregator[K, TopCMS[K], TopCMS[K]] { + + override def monoid: TopCMSMonoid[K] = cmsMonoid + + override def prepare(value: K): TopCMS[K] = monoid.create(value) + + override def present(cms: TopCMS[K]): TopCMS[K] = cms + +} + +/** + * Controls how a CMS that implements [[CMSHeavyHitters]] tracks heavy hitters. + */ +abstract class HeavyHittersLogic[K] extends java.io.Serializable { + + def updateHeavyHitters( + oldCms: CMS[K], + newCms: CMS[K] + )(hhs: HeavyHitters[K], item: K, count: Long): HeavyHitters[K] = { + val oldItemCount = oldCms.frequency(item).estimate + val oldHh = HeavyHitter[K](item, oldItemCount) + val newItemCount = oldItemCount + count + val newHh = HeavyHitter[K](item, newItemCount) + purgeHeavyHitters(newCms)(hhs - oldHh + newHh) + } + + def updateHeavyHitters(cms: CMS[K])(left: HeavyHitters[K], right: HeavyHitters[K]): HeavyHitters[K] = { + val candidates = (left.items ++ right.items).map { case i => + HeavyHitter[K](i, cms.frequency(i).estimate) + } + val newHhs = HeavyHitters.from(candidates) + purgeHeavyHitters(cms)(newHhs) + } + + def purgeHeavyHitters(cms: CMS[K])(hhs: HeavyHitters[K]): HeavyHitters[K] + +} + +/** + * Finds all heavy hitters, i.e., elements in the stream that appear at least `(heavyHittersPct * totalCount)` + * times. + * + * Every item that appears at least `(heavyHittersPct * totalCount)` times is output, and with probability `p + * >= 1 - delta`, no item whose count is less than `(heavyHittersPct - eps) * totalCount` is output. + * + * This also means that this parameter is an upper bound on the number of heavy hitters that will be tracked: + * the set of heavy hitters contains at most `1 / heavyHittersPct` elements. For example, if + * `heavyHittersPct=0.01` (or 0.25), then at most `1 / 0.01 = 100` items (or `1 / 0.25 = 4` items) will be + * tracked/returned as heavy hitters. This parameter can thus control the memory footprint required for + * tracking heavy hitters. + */ +case class TopPctLogic[K](heavyHittersPct: Double) extends HeavyHittersLogic[K] { + + require(0 < heavyHittersPct && heavyHittersPct < 1, "heavyHittersPct must lie in (0, 1)") + + override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = { + val minCount = heavyHittersPct * cms.totalCount + HeavyHitters[K](hitters.hhs.filter(_.count >= minCount)) + } + +} + +/** + * Tracks the top N heavy hitters, where `N` is defined by `heavyHittersN`. + * + * '''Warning:''' top-N computations are not associative. The effect is that a top-N CMS has an ordering bias + * (with regard to heavy hitters) when merging instances. 
This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * @see
+ *   Discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]]
+ */
+case class TopNLogic[K](heavyHittersN: Int) extends HeavyHittersLogic[K] {
+
+  require(heavyHittersN > 0, "heavyHittersN must be > 0")
+
+  override def purgeHeavyHitters(cms: CMS[K])(hitters: HeavyHitters[K]): HeavyHitters[K] = {
+    val sorted =
+      hitters.hhs.toSeq.sortBy(hh => hh.count).takeRight(heavyHittersN)
+    HeavyHitters[K](sorted.toSet)
+  }
+
+}
+
+/**
+ * Containers for holding heavy hitter items and their associated counts.
+ */
+case class HeavyHitters[K](hhs: Set[HeavyHitter[K]]) extends java.io.Serializable {
+
+  def -(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs - hh)
+
+  def +(hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters[K](hhs + hh)
+
+  def ++(other: HeavyHitters[K]): HeavyHitters[K] =
+    HeavyHitters[K](hhs ++ other.hhs)
+
+  def items: Set[K] = hhs.map(_.item)
+
+}
+
+object HeavyHitters {
+
+  def empty[K]: HeavyHitters[K] = HeavyHitters(emptyHhs)
+
+  private def emptyHhs[K]: Set[HeavyHitter[K]] = Set[HeavyHitter[K]]()
+
+  def from[K](hhs: Set[HeavyHitter[K]]): HeavyHitters[K] =
+    hhs.foldLeft(empty[K])(_ + _)
+
+  def from[K](hh: HeavyHitter[K]): HeavyHitters[K] = HeavyHitters(emptyHhs + hh)
+
+}
+
+case class HeavyHitter[K](item: K, count: Long) extends java.io.Serializable
+
+/**
+ * Monoid for Top-% based [[TopCMS]] sketches.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as Double, and then use
+ * the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersPct
+ *   A threshold for finding heavy hitters, i.e., elements that appear at least (heavyHittersPct * totalCount)
+ *   times in the stream.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`.
Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopPctCMSMonoid[K](cms: CMS[K], heavyHittersPct: Double = 0.01)
+    extends TopCMSMonoid[K](cms, TopPctLogic[K](heavyHittersPct))
+
+object TopPctCMS {
+
+  def monoid[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSMonoid[K] =
+    new TopPctCMSMonoid[K](CMS(eps, delta, seed), heavyHittersPct)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersPct: Double): TopPctCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    new TopPctCMSAggregator[K](monoid(eps, delta, seed, heavyHittersPct))
+
+  def aggregator[K: CMSHasher](
+      depth: Int,
+      width: Int,
+      seed: Int,
+      heavyHittersPct: Double
+  ): TopPctCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersPct)
+
+}
+
+/**
+ * An Aggregator for [[TopPctCMS]]. Can be created using [[TopPctCMS.aggregator]].
+ */
+case class TopPctCMSAggregator[K](cmsMonoid: TopPctCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * Monoid for top-N based [[TopCMS]] sketches. '''Use with care! (see warning below)'''
+ *
+ * =Warning: Adding top-N CMS instances (`++`) is an unsafe operation=
+ *
+ * Top-N computations are not associative. The effect is that a top-N CMS has an ordering bias (with regard to
+ * heavy hitters) when ''merging'' CMS instances (e.g. via `++`). This means merging heavy hitters across CMS
+ * instances may lead to incorrect, biased results: the outcome is biased by the order in which CMS instances
+ * / heavy hitters are being merged, with the rule of thumb being that the earlier a set of heavy hitters is
+ * merged, the more likely the end result is biased towards those heavy hitters.
+ *
+ * The warning above only applies when ''adding CMS instances'' (think: `cms1 ++ cms2`). In comparison, heavy
+ * hitters are correctly computed when:
+ *
+ *   - a top-N CMS instance is created from a single data stream, i.e. `Seq[K]`
+ *   - items are added/counted individually, i.e. `cms + item` or `cms + (item, count)`.
+ *
+ * See the discussion in [[https://github.com/twitter/algebird/issues/353 Algebird issue 353]] for further
+ * details.
+ *
+ * =Alternatives=
+ *
+ * The following, alternative data structures may be better picks than a top-N based CMS given the warning
+ * above:
+ *
+ *   - [[TopPctCMS]]: Has safe merge semantics for its instances including heavy hitters.
+ *   - [[SpaceSaver]]: Has the same ordering bias as a top-N CMS, but at least it provides bounds on the
+ *     bias.
+ *
+ * =Usage=
+ *
+ * The type `K` is the type of items you want to count. You must provide an implicit `CMSHasher[K]` for `K`,
+ * and Algebird ships with several such implicits for commonly used types such as `Long` and `BigInt`.
+ *
+ * If your type `K` is not supported out of the box, you have two options: 1) You provide a "translation"
+ * function to convert items of your (unsupported) type `K` to a supported type such as [[Double]], and then
+ * use the `contramap` function of [[CMSHasher]] to create the required `CMSHasher[K]` for your type (see the
+ * documentation of [[CMSHasher]] for an example); 2) You implement a `CMSHasher[K]` from scratch, using the
+ * existing CMSHasher implementations as a starting point.
+ *
+ * Note: Because Arrays in Scala/Java do not have sane `equals` and `hashCode` implementations, you cannot
+ * safely use types such as `Array[Byte]`. Extra work is required for Arrays. For example, you may opt to
+ * convert `Array[T]` to a `Seq[T]` via `toSeq`, or you can provide appropriate wrapper classes. Algebird
+ * provides one such wrapper class, [[Bytes]], to safely wrap an `Array[Byte]` for use with CMS.
+ *
+ * @param cms
+ *   A [[CMS]] instance, which is used for the counting and the frequency estimation performed by this class.
+ * @param heavyHittersN
+ *   The maximum number of heavy hitters to track.
+ * @tparam K
+ *   The type used to identify the elements to be counted. For example, if you want to count the occurrence of
+ *   user names, you could map each username to a unique numeric ID expressed as a `Long`, and then count the
+ *   occurrences of those `Long`s with a CMS of type `K=Long`. Note that this mapping between the elements of
+ *   your problem domain and their identifiers used for counting via CMS should be bijective. We require a
+ *   [[CMSHasher]] context bound for `K`, see [[CMSHasher]] for available implicits that can be imported.
+ *   Which type K should you pick in practice? For domains that have less than `2^64` unique elements, you'd
+ *   typically use `Long`. For larger domains you can try `BigInt`, for example.
+ */
+class TopNCMSMonoid[K](cms: CMS[K], heavyHittersN: Int = 100)
+    extends TopCMSMonoid[K](cms, TopNLogic[K](heavyHittersN))
+
+object TopNCMS {
+
+  def monoid[K: CMSHasher](eps: Double, delta: Double, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    new TopNCMSMonoid[K](CMS(eps, delta, seed), heavyHittersN)
+
+  def monoid[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSMonoid[K] =
+    monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+  def aggregator[K: CMSHasher](
+      eps: Double,
+      delta: Double,
+      seed: Int,
+      heavyHittersN: Int
+  ): TopNCMSAggregator[K] =
+    new TopNCMSAggregator[K](monoid(eps, delta, seed, heavyHittersN))
+
+  def aggregator[K: CMSHasher](depth: Int, width: Int, seed: Int, heavyHittersN: Int): TopNCMSAggregator[K] =
+    aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN)
+
+}
+
+/**
+ * An Aggregator for [[TopNCMS]]. Can be created using [[TopNCMS.aggregator]].
+ */
+case class TopNCMSAggregator[K](cmsMonoid: TopNCMSMonoid[K]) extends TopCMSAggregator(cmsMonoid)
+
+/**
+ * K1 defines a scope for the CMS. For each k1, keep the top heavyHittersN associated k2 values.
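+ *
+ * For example (hypothetical): with `K1 = String` (country), `K2 = String` (city), and `heavyHittersN = 2`,
+ * the two most frequent cities are kept per country.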
+ */ +case class ScopedTopNLogic[K1, K2](heavyHittersN: Int) extends HeavyHittersLogic[(K1, K2)] { + + require(heavyHittersN > 0, "heavyHittersN must be > 0") + + override def purgeHeavyHitters( + cms: CMS[(K1, K2)] + )(hitters: HeavyHitters[(K1, K2)]): HeavyHitters[(K1, K2)] = { + val grouped = hitters.hhs.groupBy(hh => hh.item._1) + val (underLimit, overLimit) = grouped.partition { + _._2.size <= heavyHittersN + } + val sorted = overLimit.transform { case (_, hhs) => + hhs.toSeq.sortBy(hh => hh.count) + } + val purged = sorted.transform { case (_, hhs) => + hhs.takeRight(heavyHittersN) + } + HeavyHitters[(K1, K2)](purged.values.flatten.toSet ++ underLimit.values.flatten.toSet) + } + +} + +/* + * Monoid for Top-N values per key in an associative [[TopCMS]]. + * + * Typical use case for this might be (Country, City) pairs. For a stream of such + * pairs, we might want to keep track of the most popular cities for each country. + * + * This can, of course, be achieved using a Map[Country, TopNCMS[City]], but this + * requires storing one CMS per distinct Country. + * + * Similarly, one could attempt to use a TopNCMS[(Country, City)], but less common + * countries may not make the cut if N is not "very large". + * + * ScopedTopNCMSMonoid[Country, City] will avoid having one Country drown others + * out, while still only using a single CMS. + * + * In general the eviction of K1 is not supported, and all distinct K1 values must + * be retained. Therefore it is important to only use this Monoid when the number + * of distinct K1 values is known to be reasonably bounded. + */ +class ScopedTopNCMSMonoid[K1, K2](cms: CMS[(K1, K2)], heavyHittersN: Int = 100) + extends TopCMSMonoid[(K1, K2)](cms, ScopedTopNLogic[K1, K2](heavyHittersN)) + +object ScopedTopNCMS { + + def scopedHasher[K1: CMSHasher, K2: CMSHasher]: CMSHasher[(K1, K2)] = new CMSHasher[(K1, K2)] { + private val k1Hasher = implicitly[CMSHasher[K1]] + private val k2Hasher = implicitly[CMSHasher[K2]] + + override def hash(a: Int, b: Int, width: Int)(x: (K1, K2)): Int = { + val (k1, k2) = x + val xs = Seq(k1Hasher.hash(a, b, width)(k1), k2Hasher.hash(a, b, width)(k2), a, b) + (scala.util.hashing.MurmurHash3.seqHash(xs) & Int.MaxValue) % width + } + } + + def monoid[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + new ScopedTopNCMSMonoid[K1, K2](CMS(eps, delta, seed)(scopedHasher[K1, K2]), heavyHittersN) + + def monoid[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): ScopedTopNCMSMonoid[K1, K2] = + monoid(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + eps: Double, + delta: Double, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + new TopCMSAggregator(monoid(eps, delta, seed, heavyHittersN)) + + def aggregator[K1: CMSHasher, K2: CMSHasher]( + depth: Int, + width: Int, + seed: Int, + heavyHittersN: Int + ): TopCMSAggregator[(K1, K2)] = + aggregator(CMSFunctions.eps(width), CMSFunctions.delta(depth), seed, heavyHittersN) + +} + +case class CMSHash[K: CMSHasher](a: Int, b: Int, width: Int) extends java.io.Serializable { + + /** + * Returns `a * x + b (mod p) (mod width)`. + */ + def apply(x: K): Int = implicitly[CMSHasher[K]].hash(a, b, width)(x) + +} + +/** + * This formerly held the instances that moved to object CMSHasher + * + * These instances are slow, but here for compatibility with old serialized data. 
For new code, avoid these + * and instead use the implicits found in the CMSHasher companion object. + */ +object CMSHasherImplicits { + + implicit object CMSHasherBigInt extends CMSHasher[BigInt] { + override def hash(a: Int, b: Int, width: Int)(x: BigInt): Int = + CMSHasher.hashBytes(a, b, width)(x.toByteArray) + } + + implicit object CMSHasherString extends CMSHasher[String] { + override def hash(a: Int, b: Int, width: Int)(x: String): Int = + CMSHasher.hashBytes(a, b, width)(x.getBytes("UTF-8")) + } + + def cmsHasherShort: CMSHasher[Short] = CMSHasher.cmsHasherShort +} diff --git a/algebird-core/src/main/scala-3/DecayedVector.scala b/algebird-core/src/main/scala-3/DecayedVector.scala new file mode 100644 index 000000000..18e816fe4 --- /dev/null +++ b/algebird-core/src/main/scala-3/DecayedVector.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.algebird + +/** + * Represents a container class together with time. Its monoid consists of exponentially scaling the older + * value and summing with the newer one. + */ +object DecayedVector extends CompatDecayedVector { + def buildWithHalflife[C[_]](vector: C[Double], time: Double, halfLife: Double): DecayedVector[C] = + DecayedVector(vector, time * scala.math.log(2.0) / halfLife) + + def monoidWithEpsilon[C[_]]( + eps: Double + )(implicit vs: VectorSpace[Double, C], metric: Metric[C[Double]]): Monoid[DecayedVector[C]] = + new Monoid[DecayedVector[C]] { + override val zero = DecayedVector(vs.group.zero, Double.NegativeInfinity) + override def plus(left: DecayedVector[C], right: DecayedVector[C]) = + if (left.scaledTime <= right.scaledTime) { + scaledPlus(right, left, eps) + } else { + scaledPlus(left, right, eps) + } + } + + def forMap[K](m: Map[K, Double], scaledTime: Double): DecayedVector[Map[K, _]] = + DecayedVector[Map[K, _]](m, scaledTime) + def forMapWithHalflife[K](m: Map[K, Double], time: Double, halfLife: Double): DecayedVector[Map[K, _]] = + forMap(m, time * scala.math.log(2.0) / halfLife) + + def mapMonoidWithEpsilon[K]( + eps: Double + )(implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + monoidWithEpsilon[Map[K, _]](eps) + + implicit def mapMonoid[K](implicit + vs: VectorSpace[Double, Map[K, _]], + metric: Metric[Map[K, Double]] + ): Monoid[DecayedVector[Map[K, _]]] = + mapMonoidWithEpsilon(-1.0) + + def scaledPlus[C[_]](newVal: DecayedVector[C], oldVal: DecayedVector[C], eps: Double)(implicit + vs: VectorSpace[Double, C], + metric: Metric[C[Double]] + ): DecayedVector[C] = { + implicit val mon: Monoid[C[Double]] = vs.group + val expFactor = scala.math.exp(oldVal.scaledTime - newVal.scaledTime) + val newVector = + Monoid.plus(newVal.vector, vs.scale(expFactor, oldVal.vector)) + if (eps < 0.0 || Metric.norm(newVector) > eps) { + DecayedVector(newVector, newVal.scaledTime) + } else { + DecayedVector(mon.zero, Double.NegativeInfinity) + } + } +} + +case class DecayedVector[C[_]](vector: C[Double], 
scaledTime: Double)
diff --git a/algebird-core/src/main/scala-3/DecayingCMS.scala b/algebird-core/src/main/scala-3/DecayingCMS.scala
new file mode 100644
index 000000000..54809e2a8
--- /dev/null
+++ b/algebird-core/src/main/scala-3/DecayingCMS.scala
@@ -0,0 +1,650 @@
+package com.twitter.algebird
+
+import java.lang.Double.{compare => cmp}
+import java.lang.Math
+import java.util.Arrays.deepHashCode
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+/**
+ * DecayingCMS is a module to build count-min sketch instances whose counts decay exponentially.
+ *
+ * Similar to a Map[K, com.twitter.algebird.DecayedValue], each key is associated with a single count value
+ * that decays over time. Unlike a map, the decaying CMS is an approximate count -- in exchange for the
+ * possibility of over-counting, we can bound its size in memory.
+ *
+ * The intended use case is for metrics or machine learning where exact values aren't needed.
+ *
+ * You can expect the keys with the biggest values to be fairly accurate but the very small values (rare keys
+ * or very old keys) to be lost in the noise. For both metrics and ML this should be fine: you can't learn too
+ * much from very rare values.
+ *
+ * We recommend a depth of at least 5, and a width of at least 100, but you should do some experiments to
+ * determine the smallest parameters that will work for your use case.
+ */
+final class DecayingCMS[K](
+    seed: Long,
+    val halfLife: Duration,
+    val depth: Int, // number of hashing functions
+    val width: Int, // number of table cells per hashing function
+    hasher: CMSHasher[K]
+) extends Serializable { module =>
+
+  override def toString: String =
+    s"DecayingCMS(seed=$seed, halfLife=$halfLife, depth=$depth, width=$width)"
+
+  @inline private def getNextLogScale(
+      logScale: Double,
+      oldTimeInHL: Double,
+      nowInHL: Double
+  ): Double =
+    if (nowInHL == oldTimeInHL) logScale else logScale + (nowInHL - oldTimeInHL) * log2
+
+  @inline private def getScale(logScale: Double, oldTimeInHL: Double, nowInHL: Double): Double = {
+    val logScale1 = getNextLogScale(logScale, oldTimeInHL, nowInHL)
+    Math.exp(-logScale1)
+  }
+
+  val empty: CMS =
+    new CMS(Array.fill(depth)(Vector.fill[Double](width)(0.0)), 0.0, Double.NegativeInfinity)
+
+  /**
+   * Represents a decaying scalar value at a particular point in time.
+   *
+   * The value decays according to halfLife. Another way to think about DoubleAt is that it represents a
+   * particular decay curve (and in particular, a point along that curve). Two DoubleAt values may be
+   * equivalent if they are two points on the same curve.
+   *
+   * The `timeToZero` and `timeToUnit` methods can be used to "normalize" DoubleAt values. If two DoubleAt
+   * values do not produce the same (approximate) Double values from these methods, they represent different
+   * curves.
+   */
+  class DoubleAt private[algebird] (val value: Double, val timeInHL: Double) extends Serializable {
+    lhs =>
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
+    private[algebird] def map(f: Double => Double): DoubleAt =
+      new DoubleAt(f(value), timeInHL)
+
+    // this is not public because it's not safe in general -- you need
+    // to run a function that is time-commutative.
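+    // (a rough statement of the condition, not from the original docs: f should
+    // satisfy f(c * x, c * y) == c * f(x, y) for every decay factor c > 0, as
+    // +, -, min, and max do -- multiplication, for example, does not.)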
+ private[algebird] def map2(rhs: DoubleAt)(f: (Double, Double) => Double): DoubleAt = + if (lhs.timeInHL < rhs.timeInHL) { + val x = lhs.scaledAt(rhs.timeInHL) + new DoubleAt(f(x, rhs.value), rhs.timeInHL) + } else if (lhs.timeInHL == rhs.timeInHL) { + new DoubleAt(f(lhs.value, rhs.value), rhs.timeInHL) + } else { + val y = rhs.scaledAt(lhs.timeInHL) + new DoubleAt(f(lhs.value, y), lhs.timeInHL) + } + + def unary_- : DoubleAt = new DoubleAt(-value, timeInHL) + def abs: DoubleAt = new DoubleAt(Math.abs(value), timeInHL) + def *(n: Double): DoubleAt = new DoubleAt(value * n, timeInHL) + + def +(rhs: DoubleAt): DoubleAt = map2(rhs)(_ + _) + def -(rhs: DoubleAt): DoubleAt = map2(rhs)(_ - _) + def min(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.min) + def max(rhs: DoubleAt): DoubleAt = map2(rhs)(Math.max) + + def /(rhs: DoubleAt): Double = map2(rhs)(_ / _).value + + /** + * We consider two DoubleAt values equal not just if their elements are equal, but also if they represent + * the same value at different points of decay. + */ + def compare(rhs: DoubleAt): Int = { + val vc = cmp(lhs.value, rhs.value) + val tc = cmp(lhs.timeInHL, rhs.timeInHL) + if (vc == tc) vc + else if (tc == 0) vc + else if (vc == 0) tc + else if (tc < 0) cmp(lhs.scaledAt(rhs.timeInHL), rhs.value) + else cmp(lhs.value, rhs.scaledAt(lhs.timeInHL)) + } + + /** + * Time when this value will reach the smallest double value bigger than zero, unless we are already at + * zero in which case we return the current time + */ + def timeToZero: Double = + if (java.lang.Double.isNaN(value)) Double.NaN + else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity + else if (value == 0.0) timeInHL + else timeToUnit + DoubleAt.TimeFromUnitToZero + + /** + * This is the scaled time when the current value will reach 1 (or -1 for negative values) + * + * This method is a way of collapsing a DoubleAt into a single value (the time in the past or future where + * its value would be 1, the unit value). 
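+     *
+     * A worked example (computed from the formula below; not part of the original doc): a value of 8.0 at
+     * time `t` (in half-lives) decays to 1.0 three half-lives later, so its timeToUnit is `t + 3`.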
+     */
+    def timeToUnit: Double =
+      if (java.lang.Double.isNaN(value)) Double.NaN
+      else if (java.lang.Double.isInfinite(value)) Double.PositiveInfinity
+      else if (value == 0.0) Double.NegativeInfinity
+      else {
+        // solve for result:
+        //
+        //   1 = value * module.getScale(0.0, timeInHL, result)
+        //   1 = value * Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        //   1 / value = Math.exp(-getNextLogScale(0.0, timeInHL, result))
+        //   log(1 / value) = -getNextLogScale(0.0, timeInHL, result)
+        //   -log(1 / value) = getNextLogScale(0.0, timeInHL, result)
+        //   log(value) = getNextLogScale(0.0, timeInHL, result)
+        //   log(value) = if (result == timeInHL) 0 else 0 + (result - timeInHL) * log2
+        //   log(value) = if (result == timeInHL) 0 else (result - timeInHL) * log2
+        //
+        //   log(value) = (result - timeInHL) * log2
+        //   log(value) / log2 = result - timeInHL
+        //   log(value) / log2 + timeInHL = result
+        Math.log(Math.abs(value)) / log2 + timeInHL
+      }
+
+    override def equals(that: Any): Boolean =
+      that match {
+        case d: DoubleAt => compare(d) == 0
+        case _           => false
+      }
+
+    override def hashCode: Int =
+      timeToUnit.##
+
+    override def toString: String =
+      s"DoubleAt($value, $timeInHL)"
+
+    def <(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) < 0
+    def <=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) <= 0
+    def >(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) > 0
+    def >=(rhs: DoubleAt): Boolean = (lhs.compare(rhs)) >= 0
+
+    def time: Long =
+      toTimestamp(timeInHL)
+
+    private def scaledAt(t: Double): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, t)
+
+    def at(time: Long): Double =
+      if (value == 0.0) 0.0
+      else value * module.getScale(0.0, timeInHL, fromTimestamp(time))
+  }
+
+  object DoubleAt {
+    def apply(x: Double, t: Long): DoubleAt =
+      new DoubleAt(x, fromTimestamp(t))
+
+    val zero: DoubleAt =
+      new DoubleAt(0.0, Double.NegativeInfinity)
+
+    private val TimeFromUnitToZero: Double =
+      -Math.log(Double.MinPositiveValue) / log2
+  }
+
+  val totalCells: Int = depth * width
+
+  val halfLifeSecs: Double =
+    halfLife.toMillis.toDouble / 1000.0
+
+  // TODO: consider a smaller number?
+  // we are trading accuracy for possible performance
+  private[this] val maxLogScale: Double = 20.0
+
+  /**
+   * Allocate an empty array of rows.
+   *
+   * The elements start as null. It's an important optimization _not_ to allocate vectors here, since we're
+   * often building up cells mutably.
+   */
+  private def allocCells(): Array[Vector[Double]] =
+    new Array[Vector[Double]](depth)
+
+  def toTimestamp(t: Double): Long =
+    (t * halfLifeSecs * 1000.0).toLong
+
+  def fromTimestamp(t: Long): Double =
+    (t.toDouble / 1000.0) / halfLifeSecs
+
+  val hashFns: Array[K => Int] = {
+    val rng = new Random(seed)
+    def genPos(): Int =
+      rng.nextInt() match {
+        case 0 => genPos()
+        case n => n & 0x7fffffff
+      }
+
+    (0 until depth).map { _ =>
+      val n = genPos()
+      (k: K) => hasher.hash(n, 0, width)(k)
+    }.toArray
+  }
+
+  private final val log2 = Math.log(2.0)
+
+  /**
+   * The idealized formula for updating the current value for a key (y0 -> y1) is given as:
+   *
+   * {{{
+   * delta = (t1 - t0) / halflife
+   * y1 = y0 * 2^(-delta) + n
+   * }}}
+   *
+   * However, we want to avoid having to rescale every single cell every time we update; i.e. a cell with a
+   * zero value should continue to have a zero value when n=0.
+   *
+   * Therefore, we introduce a change of variable to cell values (z) along with a scale factor (scale), and
+   * the following formula:
+   *
+   * {{{
+   * (1) zN = yN * scaleN
+   * }}}
+   *
+   * Our constraint is expressed as:
+   *
+   * {{{
+   * (2) If n=0, z1 = z0
+   * }}}
+   *
+   * In that case:
+   *
+   * {{{
+   * (3) If n=0, (y1 * scale1) = (y0 * scale0)
+   * (4) Substituting for y1, (y0 * 2^(-delta) + 0) * scale1 = y0 * scale0
+   * (5) 2^(-delta) * scale1 = scale0
+   * (6) scale1 = scale0 * 2^(delta)
+   * }}}
+   *
+   * Also, to express z1 in terms of z0, we say:
+   *
+   * {{{
+   * (7) z1 = y1 * scale1
+   * (8) z1 = (y0 * 2^(-delta) + n) * scale1
+   * (9) z1 = ((z0 / scale0) * 2^(-delta) + n) * scale1
+   * (10) z1 / scale1 = (z0 / (scale1 * 2^(-delta))) * 2^(-delta) + n
+   * (11) z1 / scale1 = z0 / scale1 + n
+   * (12) z1 = z0 + n * scale1
+   * }}}
+   *
+   * So, for cells where n=0, we just update scale0 to scale1, and for cells where n is non-zero, we update z1
+   * in terms of z0 and scale1.
+   *
+   * If we convert scale to logscale, we have:
+   *
+   * {{{
+   * (13) logscale1 = logscale0 + delta * log(2)
+   * (14) z1 = z0 + n * exp(logscale1)
+   * }}}
+   *
+   * When logscale1 gets big, we start to distort z1. For example, exp(36) is close to 2^53. We can measure
+   * when n * exp(logscale1) gets big, and in those cases we can rescale all our cells (set each z to its
+   * corresponding y) and set the logscale to 0.
+   *
+   * {{{
+   * (15) y1 = z1 / scale1
+   * (16) y1 = z1 / exp(logscale1)
+   * (17) y1 = z1 * exp(-logscale1)
+   * }}}
+   */
+  final class CMS(
+      val cells: Array[Vector[Double]],
+      val logScale: Double,
+      val timeInHL: Double
+  ) extends Serializable {
+
+    @inline private def scale: Double =
+      Math.exp(-logScale)
+
+    override def toString: String = {
+      val s = cells.iterator.map(_.toString).mkString("Array(", ", ", ")")
+      s"CMS($s, $logScale, $timeInHL)"
+    }
+
+    override def hashCode: Int =
+      deepHashCode(cells.asInstanceOf[Array[Object]]) * 59 +
+        logScale.## * 17 +
+        timeInHL.## * 37 +
+        19
+
+    // unfortunately we can't check the path-dependent type of this
+    // CMS, which we signal by using a type projection here.
+    override def equals(any: Any): Boolean =
+      any match {
+        case that: DecayingCMS[?]#CMS =>
+          this.logScale == that.logScale &&
+          this.timeInHL == that.timeInHL &&
+          this.cells.length == that.cells.length && {
+            var i = 0
+            while (i < depth) {
+              if (this.cells(i) != that.cells(i)) return false
+              i += 1
+            }
+            true
+          }
+        case _ =>
+          false
+      }
+
+    def lastUpdateTime: Long =
+      toTimestamp(timeInHL)
+
+    /**
+     * Provide lower and upper bounds on values returned for any possible key.
+     *
+     * The first value is a lower bound: even keys that have never been counted will return this value or
+     * greater. This will be zero unless the CMS is saturated.
+     *
+     * The second value is an upper bound: the key with the largest cardinality will not be reported as being
+     * larger than this value (though it might be reported as being smaller).
+     *
+     * Together these values indicate how saturated and skewed the CMS might be.
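+     *
+     * For example (hypothetical): `val (floor, cap) = cms.range`: even a never-counted key reports at least
+     * `floor`, and no key reports more than `cap`.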
+     */
+    def range: (DoubleAt, DoubleAt) = {
+      var minMinimum = Double.PositiveInfinity
+      var minMaximum = Double.PositiveInfinity
+      var i = 0
+      while (i < cells.length) {
+        val it = cells(i).iterator
+        var localMax = it.next() // we know it doesn't start empty
+        if (localMax < minMinimum) minMinimum = localMax
+        while (it.hasNext) {
+          val n = it.next()
+          if (n > localMax) localMax = n
+          else if (n < minMinimum) minMinimum = n
+        }
+        if (localMax < minMaximum) minMaximum = localMax
+        i += 1
+      }
+
+      val s = scale
+      def sc(x: Double): DoubleAt =
+        new DoubleAt(if (x == 0.0) 0.0 else x * s, timeInHL)
+
+      (sc(minMinimum), sc(minMaximum))
+    }
+
+    /**
+     * Returns the square-root of the inner product of two decaying CMSs.
+     *
+     * We want the result to decay at the same rate as the CMS for this method to be valid. Taking the square
+     * root ensures that this is true. Without it, we would violate the following equality (assuming we had
+     * at() on a CMS):
+     *
+     *   x.innerProduct(y).at(t) = x.at(t).innerProduct(y.at(t))
+     *
+     * This is why we don't support innerProduct, only innerProductRoot.
+     */
+    def innerProductRoot(that: CMS): DoubleAt = {
+      var i = 0
+      var res = Double.PositiveInfinity
+      val t = Math.max(this.timeInHL, that.timeInHL)
+      val scale = this.getScale(t) * that.getScale(t)
+      while (i < depth) {
+        var sum = 0.0
+        val it0 = this.cells(i).iterator
+        val it1 = that.cells(i).iterator
+        while (it0.hasNext) {
+          val x = it0.next() * it1.next()
+          if (x != 0.0) sum += x
+        }
+        if (sum < res) res = sum
+        i += 1
+      }
+      val x = if (res != 0.0) Math.sqrt(res * scale) else 0.0
+      new DoubleAt(x, t)
+    }
+
+    def l2Norm: DoubleAt =
+      innerProductRoot(this)
+
+    def scale(x: Double): CMS =
+      if (java.lang.Double.isNaN(x)) {
+        throw new IllegalArgumentException(s"invalid scale: $x")
+      } else if (x < 0.0) {
+        throw new IllegalArgumentException(s"negative scale is not allowed: $x")
+      } else if (x == 0.0) {
+        module.empty
+      } else {
+        val s = logScale + Math.log(x)
+        val c = new CMS(cells, s, timeInHL)
+        if (s > maxLogScale) c.rescaleTo(timeInHL) else c
+      }
+
+    /**
+     * Get the total count of all items in the CMS.
+     *
+     * The total is the same as the l1Norm, since we don't allow negative values.
+     *
+     * Total is one of the few non-approximate statistics that DecayingCMS supports. We expect the total to be
+     * exact (except for floating-point error).
+     */
+    def total: DoubleAt = {
+      val n = cells(0).sum
+      val x = if (n == 0.0) 0.0 else scale * n
+      new DoubleAt(x, timeInHL)
+    }
+
+    def get(k: K): DoubleAt = {
+      var minValue = Double.PositiveInfinity
+      var didx = 0
+      while (didx < depth) {
+        val i = hashFns(didx)(k)
+        val inner = cells(didx)
+        val value = inner(i)
+        if (value < minValue) minValue = value
+        didx += 1
+      }
+      val x = if (minValue == 0.0) 0.0 else scale * minValue
+      new DoubleAt(x, timeInHL)
+    }
+
+    def getScale(t: Double): Double =
+      module.getScale(logScale, timeInHL, t)
+
+    private final def nextLogScale(t: Double): Double =
+      module.getNextLogScale(logScale, timeInHL, t)
+
+    def +(other: CMS): CMS = {
+      val x = this
+      val y = other
+      val timeInHL = Math.max(x.timeInHL, y.timeInHL)
+      val cms = new CMS(allocCells(), 0.0, timeInHL)
+
+      val xscale = x.getScale(timeInHL)
+      val yscale = y.getScale(timeInHL)
+
+      // a zero count is zero, no matter how big the scale is.
+ @inline def prod(x: Double, y: Double): Double = + if (x == 0.0) 0.0 else x * y + + var i = 0 + while (i < depth) { + val left = x.cells(i) + val right = y.cells(i) + var j = 0 + val bldr = rowBuilder() + while (j < width) { + bldr += prod(left(j), xscale) + prod(right(j), yscale) + j += 1 + } + cms.cells(i) = bldr.result() + i += 1 + } + cms + } + + def add(t: Long, k: K, n: Double): CMS = + scaledAdd(fromTimestamp(t), k, n) + + // TODO: we could allocate a mutable scratch pad, write all the + // values into it, and then build a CMS out of it. if items is + // very small, this would be less efficient than what we're doing + // now. probably the "ideal" solution would be determine how many + // items there are. if we have fewer than ~width items, this + // approach is fine. for more, a scratch pad would be better + // (assuming we wrote that code). + // + // alternately, you could map items into (zero + item) and then + // use the monoid's sum to boil it down. + // + // we only use this in testing currently so the current code is + // fine until we rely on it in production. any change here should + // probably include benchmarks justifying the design. + def bulkAdd(items: Iterable[(Long, K, Double)]): CMS = + items.foldLeft(this) { case (c, (t, k, v)) => c.add(t, k, v) } + + private[algebird] def scaledAdd(ts1: Double, k: K, n: Double): CMS = + if (n < 0.0) { + val t = toTimestamp(ts1) + throw new IllegalArgumentException( + s"we can only add non-negative numbers to a CMS, got $n for key: $k at time: $t" + ) + } else if (n == 0.0) { + this + } else { + val logScale1 = nextLogScale(ts1) + if (logScale1 > maxLogScale) { + rescaleTo(ts1).scaledAdd(ts1, k, n) + } else { + val increment = n * Math.exp(logScale1) + val cells1 = allocCells() + var didx = 0 + while (didx < depth) { + val cell = cells(didx) + val w = hashFns(didx)(k) + cells1(didx) = cell.updated(w, cell(w) + increment) + didx += 1 + } + new CMS(cells1, logScale1, ts1) + } + } + + // Set the scale back to 0.0 + // input time is in half-lives + private[algebird] def rescaleTo(ts: Double): CMS = { + val logScale1 = nextLogScale(ts) + val expL = Math.exp(-logScale1) + if (expL == 0.0) { + new CMS(monoid.zero.cells, 0.0, ts) + } else { + val cms = new CMS(allocCells(), 0.0, ts) + var i = 0 + while (i < depth) { + val ci = cells(i) + cms.cells(i) = ci.map(_ * expL) + i += 1 + } + cms + } + } + } + + private def rowBuilder() = { + val bldr = Vector.newBuilder[Double] + bldr.sizeHint(width) + bldr + } + + object CMS { + + implicit val monoidForCMS: Monoid[CMS] = + new Monoid[CMS] { + + def zero: CMS = module.empty + + def plus(x: CMS, y: CMS): CMS = + x + y + + /** + * Turn a flat array into an array of vectors. + */ + private def scratchToCells(scratch: Array[Double]): Array[Vector[Double]] = { + val cells = new Array[Vector[Double]](depth) + var i = 0 + while (i < depth) { + var j = i * width + val limit = j + width + val bldr = rowBuilder() + while (j < limit) { + bldr += scratch(j) + j += 1 + } + cells(i) = bldr.result() + i += 1 + } + cells + } + + /** + * This method sums the first `num` items in `arr`. 
+ */ + private def innerSum(arr: Array[CMS], num: Int): CMS = + if (num == 0) zero + else if (num == 1) arr(0) + else if (num == 2) plus(arr(0), arr(1)) + else { + // start with zero + val scratch: Array[Double] = new Array(totalCells) + + val latestTimeInHL: Double = + arr.iterator.take(num).map(cms => cms.timeInHL).max + + var i = 0 + while (i < num) { + val cms = arr(i) + val scale = cms.getScale(latestTimeInHL) + var j = 0 + while (j < depth) { + val row = cms.cells(j) + val stride = j * width + var k = 0 + while (k < width) { + val n = row(k) + if (n > 0.0) { + scratch(stride + k) += scale * n + } + k += 1 + } + j += 1 + } + i += 1 + } + + val cells = scratchToCells(scratch) + + new CMS(cells, 0.0, latestTimeInHL) + } + + override def sumOption(xs: TraversableOnce[CMS]): Option[CMS] = { + + val it: Iterator[CMS] = xs.toIterator + val ChunkSize = 1000 + + // the idea here is that we read up to 1000 CMS values into + // a fixed array, crunch them down to a single CMS, store it + // in the first array index, read up to 999 more CMS values + // in, crunch them down, and so on. + var i = 0 + val arr = new Array[CMS](ChunkSize) + while (it.hasNext) { + while (it.hasNext && i < ChunkSize) { + arr(i) = it.next() + i += 1 + } + if (i > 1) { + arr(0) = innerSum(arr, i) + } + i = 1 + } + if (i == 0) None else Some(arr(0)) + } + } + } + + val monoid: Monoid[CMS] = CMS.monoidForCMS +} + +object DecayingCMS { + + /** + * Construct a DecayingCMS module. + * + * The seed is used to initialize the hash families used by the count-min sketch. Using the same seed will + * always produce the same hash family. + * + * Half-life determines the rate at which values in the CMS decay. If a key was counted once at time t, by + * time (t + halfLife), the value for that key will be 0.5. After enough half lives the value will decay to + * zero. + * + * The size of the CMS in bytes is O(depth * width). + * + * Width controls the relative error due to over-counting (approximately 1/width). For 1% error, use + * width=100, for 0.1% error, use width=1000, etc. + * + * Depth controls the probability the error bounds are broken and that probability scales with exp(-alpha * + * depth) so, a small depth (e.g. 5-10) is fine. Each update requires O(depth) work so you want to keep this + * as small as possible. + */ + def apply[K](seed: Long, halfLife: Duration, depth: Int, width: Int)(implicit + hasher: CMSHasher[K] + ): DecayingCMS[K] = + new DecayingCMS(seed, halfLife, depth, width, hasher) +} diff --git a/algebird-core/src/main/scala-3/Fold.scala b/algebird-core/src/main/scala-3/Fold.scala new file mode 100644 index 000000000..0b89f2d62 --- /dev/null +++ b/algebird-core/src/main/scala-3/Fold.scala @@ -0,0 +1,352 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import java.io.Serializable +import scala.collection.compat._ + +/** + * Folds are first-class representations of "Traversable.foldLeft." 
They have the nice property that they can + * be fused to work in parallel over an input sequence. + * + * A Fold accumulates inputs (I) into some internal type (X), converting to a defined output type (O) when + * done. We use existential types to hide internal details and to allow for internal and external (X and O) + * types to differ for "map" and "join." + * + * In discussing this type we draw parallels to Function1 and related types. You can think of a fold as a + * function "Seq[I] => O" but in reality we do not have to materialize the input sequence at once to "run" the + * fold. + * + * The traversal of the input data structure is NOT done by Fold itself. Instead we expose some methods like + * "overTraversable" that know how to iterate through various sequence types and drive the fold. We also + * expose some internal state so library authors can fold over their own types. + * + * See the companion object for constructors. + */ +sealed trait Fold[-I, +O] extends Serializable { + + /** + * Users can ignore this type. + * + * The internal accumulator type. No one outside this Fold needs to know what this is, and that's a good + * thing. It keeps type signatures sane and makes this easy to use for the amount of flexibility it + * provides. + */ + type X + + /** + * Users can ignore this method. It is exposed so library authors can run folds over their own sequence + * types. + * + * "build" constructs a FoldState, which tells us how to run the fold. It is expected that we can run the + * same Fold many times over different data structures, but we must build a new FoldState every time. + * + * See FoldState for information on how to use this for your own sequence types. + */ + def build(): FoldState[X, I, O] + + /** + * Transforms the output of the Fold after iteration is complete. This is analogous to "Future.map" or + * "Function1.compose." + */ + def map[P](f: O => P): Fold[I, P] = { + val self = this + new Fold[I, P] { + type X = self.X + override def build(): FoldState[X, I, P] = + self.build().map(f) + } + } + + /** + * Joins two folds into one and combines the results. The fused fold accumulates with both at the same time + * and combines at the end. + */ + def joinWith[I2 <: I, P, Q](other: Fold[I2, P])(f: (O, P) => Q): Fold[I2, Q] = { + val self = this + new Fold[I2, Q] { + type X = (self.X, other.X) + override def build(): FoldState[X, I2, Q] = { + val first = self.build() + val second = other.build() + new FoldState( + { case ((x, y), i) => (first.add(x, i), second.add(y, i)) }, + (first.start, second.start), + { case (x, y) => f(first.end(x), second.end(y)) } + ) + } + } + } + + /** + * Convenient shorthand for joining Folds without combining at the end. + */ + def join[I2 <: I, P](other: Fold[I2, P]): Fold[I2, (O, P)] = + joinWith(other) { case (o, p) => (o, p) } + + /** + * Transforms the input of the fold before every accumulation. (The name comes from "contravariant map.") + * This is analogous to "Function1.andThen." + */ + def contramap[H](f: H => I): Fold[H, O] = { + val self = this + new Fold[H, O] { + type X = self.X + override def build(): FoldState[X, H, O] = + self.build().contramap(f) + } + } + + /** + * Trivially runs a Fold over an empty sequence. + */ + def overEmpty: O = { + // build is a "def" so we construct the state once and use the pieces to run the fold + val state = build() + state.end(state.start) + } + + /** + * Trivially runs a Fold over a single element sequence. 
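+   * A minimal sketch (illustrative only):
+   * {{{
+   * Fold.size.overSingleton("x") // 1L
+   * }}}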
+   */
+  def overSingleton(i: I): O = {
+    val state = build()
+    state.end(state.add(state.start, i))
+  }
+
+  /**
+   * Runs a Fold over a Traversable.
+   */
+  def overTraversable(is: TraversableOnce[I]): O = {
+    val state = build()
+    state.end(is.iterator.foldLeft(state.start)(state.add))
+  }
+}
+
+/**
+ * A FoldState defines a left fold with a "hidden" accumulator type. It is exposed so library authors can run
+ * Folds over their own sequence types.
+ *
+ * The fold can be executed correctly according to the properties of "add" and your traversed data structure.
+ * For example, the "add" function of a monoidal fold will be associative. A FoldState is valid for only one
+ * iteration because the accumulator (seeded by "start") may be mutable.
+ *
+ * The three components of a fold are
+ *   add: (X, I) => X - updates and returns internal state for every input I
+ *   start: X - the initial state
+ *   end: X => O - transforms internal state to a final result
+ *
+ * Folding over Seq(x, y) would produce the result end(add(add(start, x), y))
+ */
+final class FoldState[X, -I, +O] private[algebird] (val add: (X, I) => X, val start: X, val end: X => O)
+    extends Serializable {
+
+  /**
+   * Transforms the output type of the FoldState (see Fold.map).
+   */
+  def map[P](f: O => P): FoldState[X, I, P] =
+    new FoldState(add, start, end.andThen(f))
+
+  /**
+   * Transforms the input type of the FoldState (see Fold.contramap).
+   */
+  def contramap[H](f: H => I): FoldState[X, H, O] =
+    new FoldState((x, h) => add(x, f(h)), start, end)
+}
+
+/**
+ * Methods to create and run Folds.
+ *
+ * The Folds defined here are immutable and serializable, which we expect by default. It is important that you
+ * as a user indicate mutability or non-serializability when defining new Folds. Additionally, it is
+ * recommended that "end" functions not mutate the accumulator in order to support scans (producing a stream
+ * of intermediate outputs by calling "end" at each step).
+ */
+object Fold extends CompatFold {
+
+  /**
+   * "import Fold.applicative" will bring the Applicative instance into scope. See FoldApplicative.
+   */
+  implicit def applicative[I]: Applicative[Fold[I, _]] =
+    new FoldApplicative[I]
+
+  /**
+   * Turn a common Scala foldLeft into a Fold. The accumulator MUST be immutable and serializable.
+   */
+  def foldLeft[I, O](o: O)(add: (O, I) => O): Fold[I, O] =
+    fold[O, I, O](add, o, o => o)
+
+  /**
+   * A general way of defining Folds that supports a separate accumulator type. The accumulator MUST be
+   * immutable and serializable.
+   */
+  def fold[M, I, O](add: (M, I) => M, start: M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start, end)
+    }
+
+  /**
+   * A general way of defining Folds that supports constructing mutable or non-serializable accumulators.
+   */
+  def foldMutable[M, I, O](add: (M, I) => M, start: Unit => M, end: M => O): Fold[I, O] =
+    new Fold[I, O] {
+      type X = M
+      override def build(): FoldState[X, I, O] =
+        new FoldState(add, start(()), end)
+    }
+
+  /**
+   * Fuse a sequence of Folds into one that outputs the result of each.
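+   * For example (illustrative only):
+   * {{{
+   * val firstAndLast = Fold.sequence(Seq(Fold.first[Int], Fold.last[Int]))
+   * firstAndLast.overTraversable(List(1, 2, 3)) // Seq(Some(1), Some(3))
+   * }}}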
+   */
+  def sequence[I, O](ms: Seq[Fold[I, O]]): Fold[I, Seq[O]] =
+    new Fold[I, Seq[O]] {
+      type X = Seq[Any]
+      override def build(): FoldState[Seq[Any], I, Seq[O]] = {
+        val bs: Seq[FoldState[Any, I, O]] =
+          ms.map(_.build().asInstanceOf[FoldState[Any, I, O]])
+        val adds =
+          bs.map(_.add)
+        val ends =
+          bs.map(_.end)
+        val starts: Seq[Any] =
+          bs.map(_.start)
+        val add: (Seq[Any], I) => Seq[Any] = { (xs, i) => adds.zip(xs).map { case (f, x) => f(x, i) } }
+        val end: (Seq[Any] => Seq[O]) = { xs => ends.zip(xs).map { case (f, x) => f(x) } }
+        new FoldState(add, starts, end)
+      }
+    }
+
+  /**
+   * An even simpler Fold that collects into a Seq. Shorthand for "container[I, Seq];" fewer type arguments,
+   * better type inference.
+   */
+  def seq[I]: Fold[I, Seq[I]] =
+    container[I, Seq]
+
+  /**
+   * A Fold that does no work and returns a constant. Analogous to Function1 const: def const[A, B](b: B): (A
+   * \=> B) = { _ => b }
+   */
+  def const[O](value: O): Fold[Any, O] =
+    Fold.foldLeft(value) { case (u, _) => u }
+
+  /**
+   * A Fold that runs the given side effect for every element.
+   */
+  def foreach[I](e: I => Unit): Fold[I, Unit] =
+    Fold.foldLeft(()) { case (_, i) => e(i) }
+
+  /**
+   * A Fold that returns the first value in a sequence.
+   */
+  def first[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i) => Some(i)
+      case (x, _)    => x
+    }
+
+  /**
+   * A Fold that returns the last value in a sequence.
+   */
+  def last[I]: Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) { case (_, i) => Some(i) }
+
+  /**
+   * A Fold that returns the max value in a sequence. (Biased to earlier equal values.)
+   */
+  def max[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) < 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns a min value in a sequence. (Biased to earlier equal values.)
+   */
+  def min[I](implicit ordering: Ordering[I]): Fold[I, Option[I]] =
+    Fold.foldLeft[I, Option[I]](None) {
+      case (None, i)                                  => Some(i)
+      case (Some(y), i) if ordering.compare(y, i) > 0 => Some(i)
+      case (x, _)                                     => x
+    }
+
+  /**
+   * A Fold that returns the sum of a numeric sequence. Does not protect against overflow.
+   */
+  def sum[I](implicit numeric: Monoid[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.zero) { case (x, i) => numeric.plus(x, i) }
+
+  /**
+   * For a semigroup, if we get more than 0 items, use plus.
+   */
+  def sumOption[T](implicit sg: Semigroup[T]): Fold[T, Option[T]] =
+    Fold.foldLeft(None: Option[T]) {
+      case (None, i)    => Some(i)
+      case (Some(l), r) => Some(sg.plus(l, r))
+    }
+
+  /**
+   * A Fold that returns the product of a numeric sequence. Does not protect against overflow.
+   */
+  def product[I](implicit numeric: Ring[I]): Fold[I, I] =
+    Fold.foldLeft(numeric.one) { case (x, i) => numeric.times(x, i) }
+
+  /**
+   * A Fold that returns the length of a sequence.
+   */
+  def size: Fold[Any, Long] =
+    Fold.foldLeft(0L) { case (x, _) => x + 1 }
+
+  /**
+   * A Fold that returns "true" if all elements of the sequence satisfy the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
+   */
+  def forall[I](pred: I => Boolean): Fold[I, Boolean] =
+    foldLeft(true)((b, i) => b && pred(i))
+
+  /**
+   * A Fold that returns "true" if any element of the sequence satisfies the predicate. Note this does not
+   * short-circuit enumeration of the sequence.
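+   * For example (illustrative only):
+   * {{{
+   * Fold.exists[Int](_ > 2).overTraversable(List(1, 2, 3)) // true
+   * }}}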
+ */ + def exists[I](pred: I => Boolean): Fold[I, Boolean] = + foldLeft(false)((b, i) => b || pred(i)) + + /** + * A Fold that counts the number of elements satisfying the predicate. + */ + def count[I](pred: I => Boolean): Fold[I, Long] = + foldLeft(0L) { + case (c, i) if pred(i) => c + 1L + case (c, _) => c + } +} + +/** + * Folds are Applicatives! + */ +class FoldApplicative[I] extends Applicative[Fold[I, _]] { + override def map[T, U](mt: Fold[I, T])(fn: T => U): Fold[I, U] = + mt.map(fn) + override def apply[T](v: T): Fold[I, T] = + Fold.const(v) + override def join[T, U](mt: Fold[I, T], mu: Fold[I, U]): Fold[I, (T, U)] = + mt.join(mu) + override def sequence[T](ms: Seq[Fold[I, T]]): Fold[I, Seq[T]] = + Fold.sequence(ms) + override def joinWith[T, U, V](mt: Fold[I, T], mu: Fold[I, U])(fn: (T, U) => V): Fold[I, V] = + mt.joinWith(mu)(fn) +} diff --git a/algebird-core/src/main/scala-3/Interval.scala b/algebird-core/src/main/scala-3/Interval.scala new file mode 100644 index 000000000..6a1645d16 --- /dev/null +++ b/algebird-core/src/main/scala-3/Interval.scala @@ -0,0 +1,380 @@ +/* + Copyright 2013 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.algebird + +// TODO this is clearly more general than summingbird, and should be extended to be a ring (add union, etc...) + +/** + * Represents a single interval on a T with an Ordering + */ +sealed trait Interval[T] extends java.io.Serializable { + def contains(t: T)(implicit ord: Ordering[T]): Boolean + + def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] + final def apply(t: T)(implicit ord: Ordering[T]): Boolean = contains(t) + final def &&(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = intersect(that) + + /** + * Map the Interval with a non-decreasing function. If you use a non-monotonic function (like x^2) then the + * result is meaningless. TODO: It might be good to have types for these properties in algebird. + */ + def mapNonDecreasing[U](fn: T => U): Interval[U] +} + +case class Universe[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = true + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + that + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Universe() +} + +case class Empty[T]() extends Interval[T] { + override def contains(t: T)(implicit ord: Ordering[T]): Boolean = false + override def intersect(that: Interval[T])(implicit ord: Ordering[T]): Interval[T] = + this + override def mapNonDecreasing[U](fn: T => U): Interval[U] = Empty() +} + +object Interval extends java.io.Serializable { + + /** + * Class that only exists so that [[leftClosedRightOpen]] and [[leftOpenRightClosed]] can retain the type + * information of the returned interval. The compiler doesn't know anything about ordering, so without + * [[MaybeEmpty]] the only valid return type is Interval[T]. 
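+   * For example (illustrative only):
+   * {{{
+   * Interval.leftClosedRightOpen(0, 10) // MaybeEmpty.NotSoEmpty[Int, InLowExUp]
+   * Interval.leftClosedRightOpen(10, 0) // MaybeEmpty.SoEmpty[Int, InLowExUp]
+   * }}}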
+ */ + sealed abstract class MaybeEmpty[T, NonEmpty[t] <: Interval[t]] { + def isEmpty: Boolean + } + object MaybeEmpty { + + /** + * Represents an empty interval. + */ + case class SoEmpty[T, NonEmpty[t] <: Interval[t]]() extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = true + } + + /** + * Represents a non-empty interval. + */ + case class NotSoEmpty[T, NonEmpty[t] <: Interval[t]](get: NonEmpty[T]) extends MaybeEmpty[T, NonEmpty] { + override def isEmpty: Boolean = false + } + } + + type GenIntersection[T] = Intersection[Lower, Upper, T] + type InLowExUp[T] = Intersection[InclusiveLower, ExclusiveUpper, T] + type InLowInUp[T] = Intersection[InclusiveLower, InclusiveUpper, T] + type ExLowExUp[T] = Intersection[ExclusiveLower, ExclusiveUpper, T] + type ExLowInUp[T] = Intersection[ExclusiveLower, InclusiveUpper, T] + + implicit def monoid[T: Ordering]: Monoid[Interval[T]] = + Monoid.from[Interval[T]](Universe[T]())(_ && _) + + // Automatically convert from a MaybeEmpty instance + implicit def fromMaybeEmpty[T, NonEmpty[t] <: Interval[t]](me: MaybeEmpty[T, NonEmpty]): Interval[T] = + me match { + case MaybeEmpty.SoEmpty() => Empty() + case MaybeEmpty.NotSoEmpty(i) => i + } + + def leftClosedRightOpen[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowExUp](Intersection(InclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowExUp]() + + def leftOpenRightClosed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowInUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowInUp](Intersection(ExclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowInUp]() + + def closed[T: Ordering](lower: T, upper: T): MaybeEmpty[T, InLowInUp] = + if (Ordering[T].lteq(lower, upper)) + MaybeEmpty.NotSoEmpty[T, InLowInUp](Intersection(InclusiveLower(lower), InclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, InLowInUp]() + + def open[T: Ordering](lower: T, upper: T): MaybeEmpty[T, ExLowExUp] = + if (Ordering[T].lt(lower, upper)) + MaybeEmpty.NotSoEmpty[T, ExLowExUp](Intersection(ExclusiveLower(lower), ExclusiveUpper(upper))) + else MaybeEmpty.SoEmpty[T, ExLowExUp]() + + /** + * This is here for binary compatibility reasons. 
These methods should be moved to Interval, which should
+   * also be an abstract class for better binary compatibility at the next incompatible change.
+   */
+  implicit final class IntervalMethods[T](val intr: Interval[T]) extends AnyVal {
+    def isEmpty(implicit succ: Successible[T], pred: Predecessible[T]): Boolean = intr match {
+      case Empty()    => true
+      case Universe() => false
+      case Intersection(InclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.ordering.lt(l, u)
+      case Intersection(InclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.ordering.lteq(l, u)
+      case Intersection(ExclusiveLower(l), ExclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lt(_, u))
+      case Intersection(ExclusiveLower(l), InclusiveUpper(u)) =>
+        !succ.next(l).exists(succ.ordering.lteq(_, u))
+      case InclusiveLower(_) => false // we at least have l
+      case InclusiveUpper(_) => false // we at least have u
+      case ExclusiveLower(l) =>
+        succ.next(l).isEmpty
+      case ExclusiveUpper(u) =>
+        pred.prev(u).isEmpty
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s less than t such that
+     * intr.contains(s).
+     *
+     * If this returns None, it may be Empty, Upper or Universe.
+     */
+    def boundedLeast(implicit succ: Successible[T]): Option[T] = intr match {
+      case Empty()                => None
+      case Universe()             => None
+      case _: Upper[?]            => None
+      case i @ Intersection(_, _) => i.least
+      case l: Lower[?]            => l.least
+    }
+
+    /**
+     * If this returns Some(t), then intr.contains(t) and there is no s greater than t such that
+     * intr.contains(s).
+     *
+     * If this returns None, it may be Empty, Lower, or Universe.
+     */
+    def boundedGreatest(implicit pred: Predecessible[T]): Option[T] =
+      intr match {
+        case Empty()                => None
+        case Universe()             => None
+        case _: Lower[?]            => None
+        case i @ Intersection(_, _) => i.greatest
+        case u: Upper[?]            => u.greatest
+      }
+  }
+}
+
+// Marker traits to keep lower on the left in Intersection
+sealed trait Lower[T] extends Interval[T] {
+
+  /**
+   * This may give a false positive (but should try not to). Note the case of (0,1) for the integers. If they
+   * were doubles, this would intersect, but since there are no members of the set Int that are bigger than 0
+   * and less than 1, they don't really intersect. So, ordering is not enough here. You need a stronger
+   * notion, which we don't have a typeclass for.
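+   * For example (illustrative): `ExclusiveLower(0).intersects(ExclusiveUpper(1))` is true for Int,
+   * even though no Int lies strictly between 0 and 1.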
+   */
+  def intersects(u: Upper[T])(implicit ord: Ordering[T]): Boolean
+
+  /**
+   * The smallest value that is contained here. This is an Option, because of cases like
+   * ExclusiveLower(Int.MaxValue) which are pathological and equivalent to Empty.
+   */
+  def least(implicit s: Successible[T]): Option[T]
+  def strictLowerBound(implicit p: Predecessible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Lower[T] from lowest to highest
+   */
+  def toIterable(implicit s: Successible[T]): Iterable[T] =
+    least match {
+      case Some(l) => s.iterateNext(l)
+      case None    => Iterable.empty
+    }
+}
+sealed trait Upper[T] extends Interval[T] {
+
+  /**
+   * The largest value that is contained here. This is an Option, because of cases like
+   * ExclusiveUpper(Int.MinValue), which are pathological and equivalent to Empty.
+   */
+  def greatest(implicit p: Predecessible[T]): Option[T]
+  // The smallest value that is not present
+  def strictUpperBound(implicit s: Successible[T]): Option[T]
+
+  /**
+   * Iterates all the items in this Upper[T] from highest to lowest
+   */
+  def toIterable(implicit p: Predecessible[T]): Iterable[T] =
+    greatest match {
+      case Some(g) => p.iteratePrev(g)
+      case None    => Iterable.empty
+    }
+}
+
+case class InclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lteq(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gt(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lteq(lower, upper)
+      case ExclusiveUpper(upper) => ordering.lt(lower, upper)
+    }
+  override def least(implicit s: Successible[T]): Option[T] = Some(lower)
+  override def strictLowerBound(implicit p: Predecessible[T]): Option[T] = p.prev(lower)
+  override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveLower(fn(lower))
+}
+case class ExclusiveLower[T](lower: T) extends Interval[T] with Lower[T] {
+  override def contains(t: T)(implicit ordering: Ordering[T]): Boolean =
+    ordering.lt(lower, t)
+  override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match {
+    case Universe() => this
+    case Empty()    => that
+    case ub @ InclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case ub @ ExclusiveUpper(_) =>
+      if (intersects(ub)) Intersection(this, ub) else Empty()
+    case InclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case ExclusiveLower(thatlb) =>
+      if (ordering.gteq(lower, thatlb)) this else that
+    case Intersection(thatL, thatU) => (this && thatL) && thatU
+  }
+  override def intersects(u: Upper[T])(implicit ordering: Ordering[T]): Boolean =
+    u match {
+      case InclusiveUpper(upper) => ordering.lt(lower, upper)
+      case ExclusiveUpper(upper) =>
+        ordering.lt(lower, upper) // This is a false positive for (x, next(x))
+    }
+  override def least(implicit s: Successible[T]): Option[T] = s.next(lower)
+  override def strictLowerBound(implicit p:
Predecessible[T]): Option[T] = Some(lower) + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveLower(fn(lower)) +} +case class InclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lteq(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = Some(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = s.next(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lt(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = InclusiveUpper(fn(upper)) +} +case class ExclusiveUpper[T](upper: T) extends Interval[T] with Upper[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + ordering.lt(t, upper) + override def greatest(implicit p: Predecessible[T]): Option[T] = p.prev(upper) + // The smallest value that is not present + override def strictUpperBound(implicit s: Successible[T]): Option[T] = Some(upper) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case lb @ ExclusiveLower(_) => + if (lb.intersects(this)) Intersection(lb, this) else Empty() + case InclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case ExclusiveUpper(thatub) => + if (ordering.lteq(upper, thatub)) this else that + case Intersection(thatL, thatU) => thatL && (this && thatU) + } + override def mapNonDecreasing[U](fn: T => U): Interval[U] = ExclusiveUpper(fn(upper)) +} + +case class Intersection[L[t] <: Lower[t], U[t] <: Upper[t], T](lower: L[T], upper: U[T]) extends Interval[T] { + override def contains(t: T)(implicit ordering: Ordering[T]): Boolean = + lower.contains(t) && upper.contains(t) + override def intersect(that: Interval[T])(implicit ordering: Ordering[T]): Interval[T] = that match { + case Universe() => this + case Empty() => that + case lb @ InclusiveLower(_) => (lb && lower) && upper + case lb @ ExclusiveLower(_) => (lb && lower) && upper + case ub @ InclusiveUpper(_) => lower && (ub && upper) + case ub @ ExclusiveUpper(_) => lower && (ub && upper) + case Intersection(thatL, thatU) => (lower && thatL) && (upper && thatU) + } + override def mapNonDecreasing[T1](fn: T => T1): Interval[T1] = { + val newLower = lower match { + case InclusiveLower(l) => InclusiveLower(fn(l)) + case ExclusiveLower(l) => ExclusiveLower(fn(l)) + } + val newUpper = upper match { + case InclusiveUpper(u) => InclusiveUpper(fn(u)) + case ExclusiveUpper(u) => ExclusiveUpper(fn(u)) + } + Intersection(newLower, newUpper) + } + + def least(implicit s: Successible[T]): Option[T] = + lower.least.filter(upper.contains(_)(s.ordering)) + + /** + * Goes from lowest to highest for all items that are contained in this Intersection + */ + def leastToGreatest(implicit s: Successible[T]): 
Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = s.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = lower.toIterable.iterator.takeWhile(self.upper.contains(_))
+    }
+  }
+
+  def greatest(implicit p: Predecessible[T]): Option[T] =
+    upper.greatest.filter(lower.contains(_)(p.ordering))
+
+  /**
+   * Goes from highest to lowest for all items that are contained in this Intersection
+   */
+  def greatestToLeast(implicit p: Predecessible[T]): Iterable[T] = {
+    val self = this
+    implicit val ord: Ordering[T] = p.ordering
+    // TODO https://github.com/twitter/algebird/issues/263
+    new AbstractIterable[T] {
+      // We have to do this because the normal takeWhile causes OOM on big intervals
+      override def iterator: Iterator[T] = upper.toIterable.iterator.takeWhile(self.lower.contains(_))
+    }
+  }
+
+  /**
+   * Some intervals can actually be synonyms for empty: (0,0), for instance, contains nothing. This cannot
+   * be normalized to [a, b) form, thus we return an Option. Also, there are cases like [Int.MinValue,
+   * Int.MaxValue] that cannot be expressed this way but are actually equivalent to Universe. The bottom
+   * line: if this returns None, it just means you can't express the interval this way; it does not mean it
+   * is empty or universe, etc... (there are other cases).
+   */
+  def toLeftClosedRightOpen(implicit
+      s: Successible[T]
+  ): Option[Intersection[InclusiveLower, ExclusiveUpper, T]] =
+    for {
+      l <- lower.least
+      g <- upper.strictUpperBound if s.ordering.lt(l, g)
+    } yield Intersection(InclusiveLower(l), ExclusiveUpper(g))
+}
diff --git a/algebird-core/src/main/scala-3/InvariantAlgebras.scala b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
new file mode 100644
index 000000000..6f30ebc1c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/InvariantAlgebras.scala
@@ -0,0 +1,48 @@
+package com.twitter.algebird
+
+class InvariantSemigroup[T, U](val forward: T => U, val reverse: U => T)(implicit val semigroup: Semigroup[T])
+    extends Semigroup[U] {
+  override def plus(l: U, r: U): U =
+    forward(semigroup.plus(reverse(l), reverse(r)))
+  override def sumOption(iter: TraversableOnce[U]): Option[U] =
+    semigroup.sumOption(iter.map(reverse)).map(forward)
+
+  /*
+   * Note these work for the subclasses since in those cases semigroup
+   * will be the appropriate algebra.
+   */
+  override val hashCode: Int = (forward, reverse, semigroup).hashCode
+  override def equals(that: Any): Boolean =
+    that match {
+      case r: InvariantSemigroup[?, ?]
=> + (hashCode == r.hashCode) && + (forward == r.forward) && + (reverse == r.reverse) && + (semigroup == r.semigroup) + case _ => false + } +} + +class InvariantMonoid[T, U](forward: T => U, reverse: U => T)(implicit val monoid: Monoid[T]) + extends InvariantSemigroup[T, U](forward, reverse) + with Monoid[U] { + override val zero: U = forward(monoid.zero) +} + +class InvariantGroup[T, U](forward: T => U, reverse: U => T)(implicit val group: Group[T]) + extends InvariantMonoid[T, U](forward, reverse) + with Group[U] { + override def negate(u: U): U = forward(group.negate(reverse(u))) + override def minus(l: U, r: U): U = + forward(group.minus(reverse(l), reverse(r))) +} + +class InvariantRing[T, U](forward: T => U, reverse: U => T)(implicit val ring: Ring[T]) + extends InvariantGroup[T, U](forward, reverse) + with Ring[U] { + override val one: U = forward(ring.one) + override def times(l: U, r: U): U = + forward(ring.times(reverse(l), reverse(r))) + override def product(iter: TraversableOnce[U]): U = + forward(ring.product(iter.map(reverse))) +} diff --git a/algebird-core/src/main/scala-3/JavaMonoids.scala b/algebird-core/src/main/scala-3/JavaMonoids.scala new file mode 100644 index 000000000..26ce54f0a --- /dev/null +++ b/algebird-core/src/main/scala-3/JavaMonoids.scala @@ -0,0 +1,147 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.algebird + +import java.lang.{ + Boolean => JBool, + Double => JDouble, + Float => JFloat, + Integer => JInt, + Long => JLong, + Short => JShort +} +import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap} + +import scala.collection.JavaConverters._ + +object JIntRing extends Ring[JInt] { + override val zero: JInt = JInt.valueOf(0) + override val one: JInt = JInt.valueOf(1) + override def plus(x: JInt, y: JInt): JInt = x + y + override def negate(x: JInt): JInt = -x + override def minus(x: JInt, y: JInt): JInt = x - y + override def times(x: JInt, y: JInt): JInt = x * y +} + +object JShortRing extends Ring[JShort] { + override val zero: JShort = Short.box(0) + override val one: JShort = Short.box(1) + override def plus(x: JShort, y: JShort): JShort = (x + y).toShort + override def negate(x: JShort): JShort = (-x).toShort + override def minus(x: JShort, y: JShort): JShort = (x - y).toShort + override def times(x: JShort, y: JShort): JShort = (x * y).toShort +} + +object JLongRing extends Ring[JLong] { + override val zero: JLong = JLong.valueOf(0L) + override val one: JLong = JLong.valueOf(1L) + override def plus(x: JLong, y: JLong): JLong = x + y + override def negate(x: JLong): JLong = -x + override def minus(x: JLong, y: JLong): JLong = x - y + override def times(x: JLong, y: JLong): JLong = x * y +} + +object JFloatRing extends Ring[JFloat] { + override val zero: JFloat = JFloat.valueOf(0.0f) + override val one: JFloat = JFloat.valueOf(1.0f) + override def plus(x: JFloat, y: JFloat): JFloat = x + y + override def negate(x: JFloat): JFloat = -x + override def minus(x: JFloat, y: JFloat): JFloat = x - y + override def times(x: JFloat, y: JFloat): JFloat = x * y +} + +object JDoubleRing extends Ring[JDouble] { + override val zero: JDouble = JDouble.valueOf(0.0) + override val one: JDouble = JDouble.valueOf(1.0) + override def plus(x: JDouble, y: JDouble): JDouble = x + y + override def negate(x: JDouble): JDouble = -x + override def minus(x: JDouble, y: JDouble): JDouble = x - y + override def times(x: JDouble, y: JDouble): JDouble = x * y +} + +object JBoolRing extends Ring[JBool] { + override val zero: JBool = JBool.FALSE + override val one: JBool = JBool.TRUE + override def plus(x: JBool, y: JBool): JBool = + JBool.valueOf(x.booleanValue ^ y.booleanValue) + override def negate(x: JBool): JBool = x + override def minus(x: JBool, y: JBool): JBool = plus(x, y) + override def times(x: JBool, y: JBool): JBool = + JBool.valueOf(x.booleanValue & y.booleanValue) +} + +/** + * Since Lists are mutable, this always makes a full copy. Prefer scala immutable Lists if you use scala + * immutable lists, the tail of the result of plus is always the right argument + */ +class JListMonoid[T] extends Monoid[JList[T]] { + override def isNonZero(x: JList[T]): Boolean = !x.isEmpty + override lazy val zero: JArrayList[T] = new JArrayList[T](0) + override def plus(x: JList[T], y: JList[T]): JArrayList[T] = { + val res = new JArrayList[T](x.size + y.size) + res.addAll(x) + res.addAll(y) + res + } +} + +/** + * Since maps are mutable, this always makes a full copy. Prefer scala immutable maps if you use scala + * immutable maps, this operation is much faster TODO extend this to Group, Ring + */ +class JMapMonoid[K, V: Semigroup] extends Monoid[JMap[K, V]] { + override lazy val zero: JHashMap[K, V] = new JHashMap[K, V](0) + + val nonZero: (V => Boolean) = implicitly[Semigroup[V]] match { + case mon: Monoid[?] 
=> mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: JMap[K, V]): Boolean = + !x.isEmpty && (implicitly[Semigroup[V]] match { + case mon: Monoid[?] => + x.values.asScala.exists(v => mon.isNonZero(v)) + case _ => true + }) + override def plus(x: JMap[K, V], y: JMap[K, V]): JHashMap[K, V] = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + val vsemi = implicitly[Semigroup[V]] + val result = new JHashMap[K, V](big.size + small.size) + result.putAll(big) + small.entrySet.asScala.foreach { kv => + val smallK = kv.getKey + val smallV = kv.getValue + if (big.containsKey(smallK)) { + val bigV = big.get(smallK) + val newV = + if (bigOnLeft) vsemi.plus(bigV, smallV) else vsemi.plus(smallV, bigV) + if (nonZero(newV)) + result.put(smallK, newV) + else + result.remove(smallK) + } else { + // No need to explicitly add with zero on V, just put in the small value + result.put(smallK, smallV) + } + } + result + } +} diff --git a/algebird-core/src/main/scala-3/MapAlgebra.scala b/algebird-core/src/main/scala-3/MapAlgebra.scala new file mode 100644 index 000000000..9ca370eaf --- /dev/null +++ b/algebird-core/src/main/scala-3/MapAlgebra.scala @@ -0,0 +1,320 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.algebird + +import com.twitter.algebird.macros.{Cuber, Roller} +import scala.collection.mutable.{Builder, Map => MMap} +import scala.collection.{Map => ScMap} +import algebra.ring.Rng +import scala.collection.compat._ + +trait MapOperations[K, V, M <: ScMap[K, V]] { + def add(oldMap: M, kv: (K, V)): M + def remove(oldMap: M, k: K): M + def fromMutable(mut: MMap[K, V]): M +} + +abstract class GenericMapMonoid[K, V, M <: ScMap[K, V]](implicit val semigroup: Semigroup[V]) + extends Monoid[M] + with MapOperations[K, V, M] { + + val nonZero: (V => Boolean) = semigroup match { + case mon: Monoid[?] => mon.isNonZero(_) + case _ => _ => true + } + + override def isNonZero(x: M): Boolean = + !x.isEmpty && (semigroup match { + case mon: Monoid[?] => + x.valuesIterator.exists(v => mon.isNonZero(v)) + case _ => true + }) + + override def plus(x: M, y: M): M = { + // Scala maps can reuse internal structure, so don't copy just add into the bigger one: + // This really saves computation when adding lots of small maps into big ones (common) + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small match { + // Mutable maps create new copies of the underlying data on add so don't use the + // handleImmutable method. + // Cannot have a None so 'get' is safe here. + case _: MMap[?, ?] 
=> sumOption(Seq(big, small)).get + case _ => handleImmutable(big, small, bigOnLeft) + } + } + + private def handleImmutable(big: M, small: M, bigOnLeft: Boolean) = + small.foldLeft(big) { (oldMap, kv) => + val newV = big + .get(kv._1) + .map { bigV => + if (bigOnLeft) + semigroup.plus(bigV, kv._2) + else + semigroup.plus(kv._2, bigV) + } + .getOrElse(kv._2) + if (nonZero(newV)) + add(oldMap, kv._1 -> newV) + else + remove(oldMap, kv._1) + } + override def sumOption(items: TraversableOnce[M]): Option[M] = + if (items.iterator.isEmpty) None + else { + val mutable = MMap[K, V]() + items.iterator.foreach { m => + m.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val newV = + if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v) + if (nonZero(newV)) + mutable.update(k, newV) + else + mutable.remove(k) + } + } + Some(fromMutable(mutable)) + } +} + +class MapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, Map[K, V]] { + override lazy val zero: Map[K, V] = Map[K, V]() + override def add(oldMap: Map[K, V], kv: (K, V)): Map[K, V] = oldMap + kv + override def remove(oldMap: Map[K, V], k: K): Map[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): Map[K, V] = + new MutableBackedMap(mut) +} + +class ScMapMonoid[K, V](implicit semigroup: Semigroup[V]) extends GenericMapMonoid[K, V, ScMap[K, V]] { + override lazy val zero: ScMap[K, V] = ScMap[K, V]() + override def add(oldMap: ScMap[K, V], kv: (K, V)): ScMap[K, V] = oldMap + kv + override def remove(oldMap: ScMap[K, V], k: K): ScMap[K, V] = oldMap - k + override def fromMutable(mut: MMap[K, V]): ScMap[K, V] = + new MutableBackedMap(mut) +} + +/** + * You can think of this as a Sparse vector group + */ +class MapGroup[K, V](implicit val group: Group[V]) extends MapMonoid[K, V]()(group) with Group[Map[K, V]] { + override def negate(kv: Map[K, V]): Map[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +class ScMapGroup[K, V](implicit val group: Group[V]) + extends ScMapMonoid[K, V]()(group) + with Group[ScMap[K, V]] { + override def negate(kv: ScMap[K, V]): ScMap[K, V] = + kv.iterator.map { case (k, v) => + (k, group.negate(v)) + }.toMap +} + +/** + * You can think of this as a Sparse vector ring + */ +trait GenericMapRing[K, V, M <: ScMap[K, V]] extends Rng[M] with MapOperations[K, V, M] { + + implicit def ring: Ring[V] + + override def times(x: M, y: M): M = { + val (big, small, bigOnLeft) = + if (x.size > y.size) { + (x, y, true) + } else { + (y, x, false) + } + small.foldLeft(zero) { (oldMap, kv) => + val bigV = big.getOrElse(kv._1, ring.zero) + val newV = + if (bigOnLeft) ring.times(bigV, kv._2) else ring.times(kv._2, bigV) + if (ring.isNonZero(newV)) { + add(oldMap, kv._1 -> newV) + } else { + remove(oldMap, kv._1) + } + } + } +} + +class MapRing[K, V](implicit override val ring: Ring[V]) + extends MapGroup[K, V]()(ring) + with GenericMapRing[K, V, Map[K, V]] + +class ScMapRing[K, V](implicit override val ring: Ring[V]) + extends ScMapGroup[K, V]()(ring) + with GenericMapRing[K, V, ScMap[K, V]] + +object MapAlgebra { + def rightContainsLeft[K, V: Equiv](l: Map[K, V], r: Map[K, V]): Boolean = + l.forall { case (k, v) => + r.get(k).exists(Equiv[V].equiv(_, v)) + } + + implicit def sparseEquiv[K, V: Monoid: Equiv]: Equiv[Map[K, V]] = + Equiv.fromFunction { (m1, m2) => + val cleanM1 = removeZeros(m1) + val cleanM2 = removeZeros(m2) + rightContainsLeft(cleanM1, cleanM2) && rightContainsLeft(cleanM2, 
cleanM1) + } + + def mergeLookup[T, U, V: Monoid]( + keys: TraversableOnce[T] + )(lookup: T => Option[V])(present: T => U): Map[U, V] = + sumByKey { + keys.iterator.map(k => present(k) -> lookup(k).getOrElse(Monoid.zero[V])) + } + + // Returns a new map with zero-value entries removed + def removeZeros[K, V: Monoid](m: Map[K, V]): Map[K, V] = + m.filter { case (_, v) => Monoid.isNonZero(v) } + + /** + * For each key, sum all the values. Note that if V is a Monoid, the current implementation will drop from + * the output any key where the values are all Monoid.zero. If the Semigroup is a Monoid, This function is + * equivalent to: + * + * pairs.filter(_._2 != Monoid.zero).groupBy(_._1).mapValues(_.map(_._2).sum) + * + * Otherwise, the function is equivalent to: + * + * pairs.groupBy(_._1).mapValues(_.map(_._2).sum) + */ + def sumByKey[K, V: Semigroup](pairs: TraversableOnce[(K, V)]): Map[K, V] = + Monoid.sum(pairs.iterator.map(Map(_))) + + /** + * For each key, creates a list of all values. This function is equivalent to: + * + * pairs.groupBy(_._1).mapValues(_.map(_._2)) + */ + def group[K, V](pairs: TraversableOnce[(K, V)]): Map[K, List[V]] = + if (pairs.iterator.isEmpty) Map.empty + else { + val mutable = MMap[K, Builder[V, List[V]]]() + pairs.iterator.foreach { case (k, v) => + val oldVOpt = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val bldr = if (oldVOpt.isEmpty) { + val b = List.newBuilder[V] + mutable.update(k, b) + b + } else oldVOpt.get + bldr += v + } + mutable.iterator.map { case (k, bldr) => (k, bldr.result()) }.toMap + } + + // Consider this as edges from k -> v, produce a Map[K,Set[V]] + def toGraph[K, V](pairs: TraversableOnce[(K, V)]): Map[K, Set[V]] = + Monoid.sum(pairs.map { case (k, v) => Map(k -> Set(v)) }) + + /** join the keys of two maps (similar to outer-join in a DB) */ + def join[K, V, W](map1: Map[K, V], map2: Map[K, W]): Map[K, (Option[V], Option[W])] = + Monoid + .plus( + map1.transform { case (_, v) => + (List(v), List[W]()) + }, + map2.transform { case (_, w) => + (List[V](), List(w)) + } + ) + .transform { case (_, (v, w)) => (v.headOption, w.headOption) } + + /** + * Reverses a graph losslessly None key is for v's with no sources. 
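+   * For example (illustrative only):
+   * {{{
+   * invertExact(Map(Some(1) -> Set("a", "b")))
+   *   // Map(Some("a") -> Set(1), Some("b") -> Set(1))
+   * invertExact(Map(Some(2) -> Set.empty[String]))
+   *   // Map(None -> Set(2))
+   * }}}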
+ */ + def invertExact[K, V](m: Map[Option[K], Set[V]]): Map[Option[V], Set[K]] = { + def nonEmptyIter[T](i: Iterable[T]): Iterable[Option[T]] = + if (i.isEmpty) Iterable(None) + else { + i.map(Some(_)) + } + + Monoid.sum { + for { + (k, sv) <- m.view.toIterable + v <- nonEmptyIter(sv) + } yield Map(v -> k.toSet) + } + } + + /** + * Invert the Common case of exactly one value for each key + */ + def invert[K, V](m: Map[K, V]): Map[V, Set[K]] = + Monoid.sum(m.view.toIterable.map { case (k, v) => Map(v -> Set(k)) }) + + def dot[K, V](left: Map[K, V], right: Map[K, V])(implicit mring: Ring[Map[K, V]], mon: Monoid[V]): V = + Monoid.sum(mring.times(left, right).values) + + def cube[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K]): Map[c.K, List[V]] = { + val map: MMap[c.K, List[V]] = MMap[c.K, List[V]]() + it.iterator.foreach { case (k, v) => + c(k).iterator.foreach { ik => + map.get(ik) match { + case Some(vs) => map += ik -> (v :: vs) + case None => map += ik -> List(v) + } + } + } + map.foreach { case (k, v) => map(k) = v.reverse } + new MutableBackedMap(map) + } + + def cubeSum[K, V](it: TraversableOnce[(K, V)])(implicit c: Cuber[K], sg: Semigroup[V]): Map[c.K, V] = + sumByKey(it.iterator.flatMap { case (k, v) => c(k).map((_, v)) }) + + def cubeAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])( + fn: T => K + )(implicit c: Cuber[K]): Map[c.K, V] = + sumByKey(it.iterator.flatMap(t => c(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup) + .map { case (k, v) => (k, agg.present(v)) } + + def rollup[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K]): Map[r.K, List[V]] = { + val map: MMap[r.K, List[V]] = MMap[r.K, List[V]]() + it.iterator.foreach { case (k, v) => + r(k).iterator.foreach { ik => + map.get(ik) match { + case Some(vs) => map += ik -> (v :: vs) + case None => map += ik -> List(v) + } + } + } + map.foreach { case (k, v) => map(k) = v.reverse } + new MutableBackedMap(map) + } + + def rollupSum[K, V](it: TraversableOnce[(K, V)])(implicit r: Roller[K], sg: Semigroup[V]): Map[r.K, V] = + sumByKey(it.iterator.flatMap { case (k, v) => r(k).iterator.map((_, v)) }) + + def rollupAggregate[T, K, U, V](it: TraversableOnce[T], agg: Aggregator[T, U, V])( + fn: T => K + )(implicit r: Roller[K]): Map[r.K, V] = + sumByKey(it.iterator.flatMap(t => r(fn(t)).iterator.map((_, agg.prepare(t)))))(agg.semigroup) + .map { case (k, v) => (k, agg.present(v)) } + +} diff --git a/algebird-core/src/main/scala-3/Scan.scala b/algebird-core/src/main/scala-3/Scan.scala new file mode 100644 index 000000000..2dc2ff9c2 --- /dev/null +++ b/algebird-core/src/main/scala-3/Scan.scala @@ -0,0 +1,333 @@ +package com.twitter.algebird + +import scala.collection.compat._ + +object Scan { + + /** + * Most consumers of Scan don't care about the type of the type State type variable. But for those that do, + * we make an effort to expose it in all of our combinators. 
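+   * For example (illustrative), `Scan.Aux[Int, Long, Long]` denotes a `Scan[Int, Long]` whose internal
+   * `State` is known to be `Long`.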
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   */
+  type Aux[-I, S, +O] = Scan[I, O] { type State = S }
+
+  implicit def applicative[I]: Applicative[Scan[I, _]] = new ScanApplicative[I]
+
+  def from[I, S, O](initState: S)(presentAndNextStateFn: (I, S) => (O, S)): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override val initialState = initState
+      override def presentAndNextState(i: I, s: State): (O, State) = presentAndNextStateFn(i, s)
+    }
+
+  def fromFunction[I, O](f: I => O): Aux[I, Unit, O] = new Scan[I, O] {
+    override type State = Unit
+    override val initialState = ()
+    override def presentAndNextState(i: I, stateBeforeProcessingI: Unit): (O, State) = (f(i), ())
+  }
+
+  /**
+   * Scans take streams of inputs to streams of outputs, but some scans have trivial inputs and just produce
+   * a stream of outputs. Streams can be thought of as being a hidden state that is queryable for a head
+   * element, and another hidden state that represents the rest of the stream.
+   * @param initState
+   *   The initial state of the scan; think of this as an infinite stream.
+   * @param destructor
+   *   This function decomposes a stream into its head-element and tail-stream.
+   * @tparam S
+   *   The hidden state of the stream that we are turning into a Scan.
+   * @tparam O
+   *   The type of the elements of the stream that we are turning into a Scan.
+   * @return
+   *   A Scan whose inputs are irrelevant, and whose outputs are those that we would get from implementing a
+   *   stream using the information provided to this method.
+   */
+  def iterate[S, O](initState: S)(destructor: S => (O, S)): Aux[Any, S, O] = new Scan[Any, O] {
+    override type State = S
+    override val initialState = initState
+    override def presentAndNextState(i: Any, stateBeforeProcessingI: S): (O, S) =
+      destructor(stateBeforeProcessingI)
+  }
+
+  /**
+   * A Scan whose `Nth` output is the number `N` (starting from 0).
+   */
+  val index: Aux[Any, Long, Long] = iterate(0L)(n => (n, n + 1))
+
+  def identity[A]: Aux[A, Unit, A] = fromFunction[A, A](x => x)
+
+  /**
+   * @param initStateCreator
+   *   A call-by-name method that allocates new mutable state
+   * @param presentAndUpdateStateFn
+   *   A function that both presents the output value, and has the side-effect of updating the mutable state
+   * @tparam I
+   * @tparam S
+   * @tparam O
+   * @return
+   *   A Scan that safely encapsulates state while it's doing its thing.
+   */
+  def mutable[I, S, O](initStateCreator: => S)(presentAndUpdateStateFn: (I, S) => O): Aux[I, S, O] =
+    new Scan[I, O] {
+      override type State = S
+      override def initialState = initStateCreator
+      override def presentAndNextState(i: I, s: S): (O, S) = (presentAndUpdateStateFn(i, s), s)
+    }
+
+  /**
+   * The trivial scan that always returns the same value, regardless of input
+   * @param t
+   * @tparam T
+   */
+  def const[T](t: T): Aux[Any, Unit, T] = fromFunction(_ => t)
+
+  /**
+   * @param aggregator
+   * @param initState
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where
+   *   `c_i = initState + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromAggregator[A, B, C](aggregator: Aggregator[A, B, C], initState: B): Aux[A, B, C] =
+    from(initState) { (a: A, stateBeforeProcessingI: B) =>
+      // nb: the order of the arguments to semigroup.plus here is what determines the order of the final
+      // summation; this matters because not all semigroups are commutative
+      val stateAfterProcessingA =
+        aggregator.append(stateBeforeProcessingI, a)
+      (aggregator.present(stateAfterProcessingA), stateAfterProcessingA)
+    }
+
+  /**
+   * @param monoidAggregator
+   * @tparam A
+   * @tparam B
+   * @tparam C
+   * @return
+   *   A scan which, when given `[a_1, ..., a_n]` outputs `[c_1, ..., c_n]` where
+   *   `c_i = monoidAggregator.monoid.zero + aggregator.prepare(a_1) + ... + aggregator.prepare(a_i)`
+   */
+  def fromMonoidAggregator[A, B, C](monoidAggregator: MonoidAggregator[A, B, C]): Aux[A, B, C] =
+    fromAggregator(monoidAggregator, monoidAggregator.monoid.zero)
+
+}
+
+/**
+ * The Scan trait is an alternative to the `scanLeft` method on iterators/other collections for a range of
+ * use-cases where `scanLeft` is awkward to use. At a high level it provides some of the same functionality
+ * as `scanLeft`, but with a separation of "what is the state of the scan" from "what are the elements that
+ * I'm scanning over?". In particular, when scanning over an iterator with `N` elements, the output is an
+ * iterator with `N` elements (in contrast to scanLeft's `N+1`).
+ *
+ * If you find yourself writing a `scanLeft` over pairs of elements, using only one element of the pair
+ * within the `scanLeft`, and then throwing that element away in a `map` immediately after the `scanLeft` is
+ * done, then this abstraction is for you.
+ *
+ * The canonical method to use a scan is `apply`.
+ *
+ * @tparam I
+ *   The type of elements that the computation is scanning over.
+ * @tparam O
+ *   The output type of the scan (typically distinct from the hidden `State` of the scan).
+ */
+sealed abstract class Scan[-I, +O] extends Serializable {
+
+  import Scan.{from, Aux}
+
+  /**
+   * The computation of any given scan involves keeping track of a hidden state.
+   */
+  type State
+
+  /**
+   * The state of the scan before any elements have been processed
+   * @return
+   */
+  def initialState: State
+
+  /**
+   * @param i
+   *   An element in the stream to process
+   * @param stateBeforeProcessingI
+   *   The state of the scan before processing i
+   * @return
+   *   The output of the scan corresponding to processing i with state stateBeforeProcessing, along with the
+   *   result of updating stateBeforeProcessing with the information from i.
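+   * For example, a running-sum scan could satisfy this contract as follows (an illustrative sketch, not
+   * part of the original source):
+   * {{{
+   * val runningSum = Scan.from(0)((i: Int, s: Int) => (s + i, s + i))
+   * runningSum(List(1, 2, 3)) // List(1, 3, 6)
+   * }}}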
+ */ + def presentAndNextState(i: I, stateBeforeProcessingI: State): (O, State) + + /** + * @param iter + * @return + * If `iter = Iterator(a_1, ..., a_n)`, return:` `Iterator(o_1, ..., o_n)` where `(o_(i+1), state_(i+1)) = + * presentAndNextState(a_i, state_i)` and `state_0 = initialState` + */ + def scanIterator(iter: Iterator[I]): Iterator[O] = new AbstractIterator[O] { + override def hasNext: Boolean = iter.hasNext + var state: State = initialState + override def next(): O = { + val thisState = state + val thisA = iter.next() + val (thisC, nextState) = presentAndNextState(thisA, thisState) + state = nextState + thisC + } + } + + /** + * @param inputs + * @param bf + * @tparam In + * The type of the input collection + * @tparam Out + * The type of the output collection + * @return + * Given inputs as a collection of the form `[a_1, ..., a_n]` the output will be a collection of the form: + * `[o_1, ..., o_n]` where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 = + * initialState`. + */ + def apply[In <: TraversableOnce[I], Out]( + inputs: In + )(implicit bf: BuildFrom[In, O, Out]): Out = + bf.fromSpecific(inputs)(scanIterator(inputs.toIterator)) + + // combinators + + /** + * Return a new scan that is the same as this scan, but with a different `initialState`. + * @param newInitialState + * @return + */ + def replaceState(newInitialState: => State): Aux[I, State, O] = + from(newInitialState)(presentAndNextState(_, _)) + + def composePrepare[I1](f: I1 => I): Aux[I1, State, O] = from(initialState) { (i, stateBeforeProcessingI) => + presentAndNextState(f(i), stateBeforeProcessingI) + } + + def andThenPresent[O1](g: O => O1): Aux[I, State, O1] = from(initialState) { (i, stateBeforeProcessingI) => + val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI) + (g(c), stateAfterProcessingA) + } + + /** + * Return a scan that is semantically identical to `this.join(Scan.identity[I1])`, but where we don't + * pollute the `State` by pairing it redundantly with `Unit`. + * @tparam I1 + * @return + * If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form `[o_1, + * ..., o_n`, then this results in a Scan whose `apply` method returns `[(o_1, a_1), ..., (o_n, a_n)]` + * when given the same input. + */ + def joinWithInput[I1 <: I]: Aux[I1, State, (O, I1)] = from(initialState) { (i, stateBeforeProcessingI) => + val (o, stateAfterProcessingI) = presentAndNextState(i, stateBeforeProcessingI) + ((o, i), stateAfterProcessingI) + } + + /** + * Return a scan whose output is paired with the state of the scan before each input updates the state. + * @return + * If this Scan's `apply` method is given inputs [a_1, ..., a_n] resulting in outputs of the form `[o_1, + * ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_i, state_i)` and `state_0 = + * initialState`, return a scan that whose apply method, when given inputs `[a_1, ..., a_n]` will return + * `[(o_1, state_0), ..., (o_n, state_(n-1))]`. + */ + def joinWithPriorState: Aux[I, State, (State, O)] = from(initialState) { (i, stateBeforeProcessingI) => + val (o, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI) + ((stateBeforeProcessingI, o), stateAfterProcessingA) + } + + /** + * Return a scan whose output is paired with the state of the scan after each input updates the state. 
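+   * For the running-sum sketch above, this would emit `(sum, sum)` at each step, since there the output
+   * and the posterior state coincide.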
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, where `(o_(i+1), state_(i+1)) = presentAndNextState(a_(i+1), state_i)` and
+   *   `state_0 = initialState`, return a scan whose `apply` method, when given inputs `[a_1, ..., a_n]`,
+   *   will return `[(o_1, state_1), ..., (o_n, state_n)]`.
+   */
+  def joinWithPosteriorState: Aux[I, State, (O, State)] = from(initialState) { (i, stateBeforeProcessingI) =>
+    val (c, stateAfterProcessingA) = presentAndNextState(i, stateBeforeProcessingI)
+    ((c, stateAfterProcessingA), stateAfterProcessingA)
+  }
+
+  /**
+   * For every `foo`, `scan.joinWithIndex(foo) == scan(foo).zipWithIndex`.
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, return a scan whose `apply` method, when given the same input, will return
+   *   `[(o_1, 0), ..., (o_n, n - 1)]`.
+   */
+  def joinWithIndex: Aux[I, (State, Long), (O, Long)] = join(Scan.index)
+
+  /**
+   * Compose two scans pairwise such that, when given pairwise zipped inputs, the resulting scan will output
+   * pairwise zipped outputs.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([b_1, ..., b_n]) = [p_1, ..., p_n]`, then `zip` will return a scan
+   *   whose `apply` method, when given input `[(a_1, b_1), ..., (a_n, b_n)]`, results in the output
+   *   `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.zip(scan2)(foo.zip(bar)) ==
+   *   scan(foo).zip(scan2(bar))`
+   */
+  def zip[I2, O2](scan2: Scan[I2, O2]): Aux[(I, I2), (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i1i2, stateBeforeProcessingI1I2) =>
+      val (o1, state1AfterProcesingI1) =
+        presentAndNextState(i1i2._1, stateBeforeProcessingI1I2._1)
+      val (o2, state2AfterProcesingI2) =
+        scan2.presentAndNextState(i1i2._2, stateBeforeProcessingI1I2._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Given a scan that takes compatible input to this one, pairwise compose the state and outputs of each scan
+   * on a common input stream.
+   * @param scan2
+   * @tparam I2
+   * @tparam O2
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([a_1, ..., a_n]) = [p_1, ..., p_n]`, then `join` will return a scan
+   *   whose `apply` method returns `[(o_1, p_1), ..., (o_n, p_n)]`. In other words: `scan.join(scan2)(foo) ==
+   *   scan(foo).zip(scan2(foo))`
+   */
+  def join[I2 <: I, O2](scan2: Scan[I2, O2]): Aux[I2, (State, scan2.State), (O, O2)] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o1, state1AfterProcesingI1) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (o2, state2AfterProcesingI2) = scan2.presentAndNextState(i, stateBeforeProcessingI._2)
+      ((o1, o2), (state1AfterProcesingI1, state2AfterProcesingI2))
+    }
+
+  /**
+   * Takes the output of this scan and feeds it as input into scan2.
+   * @param scan2
+   * @tparam P
+   * @return
+   *   If this Scan's `apply` method is given inputs `[a_1, ..., a_n]` resulting in outputs of the form
+   *   `[o_1, ..., o_n]`, and `scan2.apply([o_1, ..., o_n]) = [p_1, ..., p_n]`, then `compose` will return a
+   *   scan which returns `[p_1, ..., p_n]`.
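+   *
+   * For instance (hypothetical, reusing the `runningSum` sketch): composing the running-sum scan with itself
+   * produces running sums of the prefix sums:
+   * {{{
+   * // runningSum.compose(runningSum)(List(1L, 1L, 1L)) == List(1L, 3L, 6L)
+   * }}}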
+   */
+  def compose[P](scan2: Scan[O, P]): Aux[I, (State, scan2.State), P] =
+    from((initialState, scan2.initialState)) { (i, stateBeforeProcessingI) =>
+      val (o, state1AfterProcesingI) = presentAndNextState(i, stateBeforeProcessingI._1)
+      val (p, state2AfterProcesingO) = scan2.presentAndNextState(o, stateBeforeProcessingI._2)
+      (p, (state1AfterProcesingI, state2AfterProcesingO))
+    }
+
+}
+
+class ScanApplicative[I] extends Applicative[Scan[I, _]] {
+  override def map[T, U](mt: Scan[I, T])(fn: T => U): Scan[I, U] =
+    mt.andThenPresent(fn)
+
+  override def apply[T](v: T): Scan[I, T] =
+    Scan.const(v)
+
+  override def join[T, U](mt: Scan[I, T], mu: Scan[I, U]): Scan[I, (T, U)] =
+    mt.join(mu)
+}
diff --git a/algebird-core/src/main/scala-3/SpaceSaver.scala b/algebird-core/src/main/scala-3/SpaceSaver.scala
new file mode 100644
index 000000000..5f9eee7e6
--- /dev/null
+++ b/algebird-core/src/main/scala-3/SpaceSaver.scala
@@ -0,0 +1,296 @@
+package com.twitter.algebird
+
+import java.nio.ByteBuffer
+
+import scala.collection.immutable.SortedMap
+import scala.util.{Failure, Success, Try}
+
+object SpaceSaver {
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item. This is the public api to create a new
+   * SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T): SpaceSaver[T] = SSOne(capacity, item)
+
+  /**
+   * Construct SpaceSaver with given capacity containing a single item with provided exact count. This is the
+   * public api to create a new SpaceSaver.
+   */
+  def apply[T](capacity: Int, item: T, count: Long): SpaceSaver[T] =
+    SSMany(capacity, Map(item -> ((count, 0L))))
+
+  private[algebird] val ordering =
+    Ordering.by[(?, (Long, Long)), (Long, Long)] { case (_, (count, err)) =>
+      (-count, err)
+    }
+
+  implicit def spaceSaverSemiGroup[T]: Semigroup[SpaceSaver[T]] =
+    new SpaceSaverSemigroup[T]
+
+  /**
+   * Encodes the SpaceSaver as a sequence of bytes containing in order
+   *   - 1 byte: 1/2 => 1 = SSOne, 2 = SSMany
+   *   - 4 bytes: the capacity
+   *   - N bytes: the items/counters (encoded as: the number of counters, then, per counter, the item size,
+   *     the item bytes and the 2 counter values)
+   */
+  def toBytes[T](ss: SpaceSaver[T], tSerializer: T => Array[Byte]): Array[Byte] =
+    ss match {
+      case SSOne(capacity, item) =>
+        val itemAsBytes = tSerializer(item)
+        val itemLength = itemAsBytes.length
+        // 1 for the type, 4 for capacity, 4 for itemAsBytes.length
+        val buffer = new Array[Byte](1 + 4 + 4 + itemLength)
+        ByteBuffer
+          .wrap(buffer)
+          .put(1: Byte)
+          .putInt(capacity)
+          .putInt(itemLength)
+          .put(itemAsBytes)
+        buffer
+
+      case SSMany(
+            capacity,
+            counters,
+            _
+          ) => // We do not care about the buckets as they are created by SSMany.apply
+        val buffer = scala.collection.mutable.ArrayBuffer.newBuilder[Byte]
+        buffer += (2: Byte)
+
+        var buff = ByteBuffer.allocate(4)
+        buff.putInt(capacity)
+        buffer ++= buff.array()
+
+        buff = ByteBuffer.allocate(4)
+        buff.putInt(counters.size)
+        buffer ++= buff.array()
+        counters.foreach { case (item, (a, b)) =>
+          val itemAsBytes = tSerializer(item)
+
+          buff = ByteBuffer.allocate(4)
+          buff.putInt(itemAsBytes.length)
+          buffer ++= buff.array()
+
+          buffer ++= itemAsBytes
+
+          buff = ByteBuffer.allocate(8 * 2)
+          buff.putLong(a)
+          buff.putLong(b)
+          buffer ++= buff.array()
+        }
+        buffer.result().toArray
+    }
+
+  // Make sure to be reversible so fromBytes(toBytes(x)) == x
+  def fromBytes[T](bytes: Array[Byte], tDeserializer: Array[Byte] => Try[T]): Try[SpaceSaver[T]] =
+    fromByteBuffer(ByteBuffer.wrap(bytes), buffer => tDeserializer(buffer.array()))
+
+  def fromByteBuffer[T](bb: ByteBuffer, tDeserializer: ByteBuffer => Try[T]): Try[SpaceSaver[T]] =
+    Try {
+      bb.get.toInt match {
+        case 1 =>
+          val capacity = bb.getInt
+          val itemLength = bb.getInt
+          val itemAsBytes = new Array[Byte](itemLength)
+          bb.get(itemAsBytes)
+          tDeserializer(ByteBuffer.wrap(itemAsBytes)).map(item => SSOne(capacity, item))
+        case 2 =>
+          val capacity = bb.getInt
+
+          var countersToDeserialize = bb.getInt
+          val counters = scala.collection.mutable.Map.empty[T, (Long, Long)]
+          while (countersToDeserialize != 0) {
+            val itemLength = bb.getInt()
+            val itemAsBytes = new Array[Byte](itemLength)
+            bb.get(itemAsBytes)
+            val item = tDeserializer(ByteBuffer.wrap(itemAsBytes))
+
+            val a = bb.getLong
+            val b = bb.getLong
+
+            item match {
+              case Failure(e) => return Failure(e)
+              case Success(i) =>
+                counters += ((i, (a, b)))
+            }
+
+            countersToDeserialize -= 1
+          }
+
+          Success(SSMany(capacity, counters.toMap))
+      }
+    }.flatten
+}
+
+/**
+ * Data structure used in the Space-Saving algorithm to find the approximate most frequent and top-k elements.
+ * The algorithm is described in "Efficient Computation of Frequent and Top-k Elements in Data Streams". See
+ * here: www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf. In the paper the data structure is called
+ * StreamSummary but we chose to call it SpaceSaver instead. Note that the adaptation to Hadoop and
+ * parallelization were not described in the article and have not been proven to be mathematically correct or
+ * to preserve the guarantees or benefits of the algorithm.
+ */
+sealed abstract class SpaceSaver[T] {
+  import SpaceSaver.ordering
+
+  /**
+   * Maximum number of counters to keep (parameter "m" in the research paper).
+   */
+  def capacity: Int
+
+  /**
+   * Current lowest value for count
+   */
+  def min: Long
+
+  /**
+   * Map of item to counter, where each counter consists of an observed count and possible over-estimation
+   * (error)
+   */
+  def counters: Map[T, (Long, Long)]
+
+  def ++(other: SpaceSaver[T]): SpaceSaver[T]
+
+  /**
+   * Returns the frequency estimate for the item.
+   */
+  def frequency(item: T): Approximate[Long] = {
+    val (count, err) = counters.getOrElse(item, (min, min))
+    Approximate(count - err, count, count, 1.0)
+  }
+
+  /**
+   * Get the elements that show up at least `thres` times. Returns sorted in descending order: (item,
+   * Approximate[Long], guaranteed)
+   */
+  def mostFrequent(thres: Int): Seq[(T, Approximate[Long], Boolean)] =
+    counters.iterator
+      .filter { case (_, (count, _)) => count >= thres }
+      .toList
+      .sorted(ordering)
+      .map { case (item, (count, err)) =>
+        (item, Approximate(count - err, count, count, 1.0), thres <= count - err)
+      }
+
+  /**
+   * Get the top-k elements. Returns sorted in descending order: (item, Approximate[Long], guaranteed)
+   */
+  def topK(k: Int): Seq[(T, Approximate[Long], Boolean)] = {
+    require(k < capacity)
+    val si = counters.toList
+      .sorted(ordering)
+    val siK = si.take(k)
+    val countKPlus1 = si.drop(k).headOption.map(_._2._1).getOrElse(0L)
+    siK.map { case (item, (count, err)) =>
+      (item, Approximate(count - err, count, count, 1.0), countKPlus1 < count - err)
+    }
+  }
+
+  /**
+   * Check consistency with another SpaceSaver, useful for testing. Returns a boolean indicating whether they
+   * are consistent.
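+   *
+   * For example (a hypothetical sketch): two sketches summarizing the same multiset of items, built in
+   * different ways, are expected to be consistent:
+   * {{{
+   * val a = SpaceSaver(10, "x") ++ SpaceSaver(10, "y") ++ SpaceSaver(10, "x")
+   * val b = SpaceSaver(10, "x", 2L) ++ SpaceSaver(10, "y", 1L)
+   * a.consistentWith(b) // expected: true
+   * }}}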
+   */
+  def consistentWith(that: SpaceSaver[T]): Boolean =
+    (counters.keys ++ that.counters.keys).forall(item => (frequency(item) - that.frequency(item)) ~ 0)
+}
+
+case class SSOne[T] private[algebird] (override val capacity: Int, item: T) extends SpaceSaver[T] {
+  require(capacity > 1)
+
+  override def min: Long = 0L
+
+  override def counters: Map[T, (Long, Long)] = Map(item -> ((1L, 1L)))
+
+  override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+    case other: SSOne[?]  => SSMany(this).add(other)
+    case other: SSMany[?] => other.add(this)
+  }
+}
+
+object SSMany {
+  private def bucketsFromCounters[T](counters: Map[T, (Long, Long)]): SortedMap[Long, Set[T]] =
+    SortedMap[Long, Set[T]]() ++ counters.groupBy(_._2._1).mapValues(_.keySet).toMap
+
+  private[algebird] def apply[T](capacity: Int, counters: Map[T, (Long, Long)]): SSMany[T] =
+    SSMany(capacity, counters, bucketsFromCounters(counters))
+
+  private[algebird] def apply[T](one: SSOne[T]): SSMany[T] =
+    SSMany(one.capacity, Map(one.item -> ((1L, 0L))), SortedMap(1L -> Set(one.item)))
+}
+
+case class SSMany[T] private (
+    override val capacity: Int,
+    override val counters: Map[T, (Long, Long)],
+    buckets: SortedMap[Long, Set[T]]
+) extends SpaceSaver[T] {
+  private val exact: Boolean = counters.size < capacity
+
+  override val min: Long = if (counters.size < capacity) 0L else buckets.firstKey
+
+  // item is already present and just needs to be bumped up one
+  private def bump(item: T) = {
+    val (count, err) = counters(item)
+    val counters1 = counters + (item -> ((count + 1L, err))) // increment by one
+    val currBucket = buckets(count) // current bucket
+    val buckets1 = {
+      if (currBucket.size == 1) // delete current bucket since it will be empty
+        buckets - count
+      else // remove item from current bucket
+        buckets + (count -> (currBucket - item))
+    } + (count + 1L -> (buckets.getOrElse(count + 1L, Set()) + item))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // lose one item to meet capacity constraint
+  private def loseOne = {
+    val firstBucket = buckets(buckets.firstKey)
+    val itemToLose = firstBucket.head
+    val counters1 = counters - itemToLose
+    val buckets1 =
+      if (firstBucket.size == 1)
+        buckets - min
+      else
+        buckets + (min -> (firstBucket - itemToLose))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // introduce new item
+  private def introduce(item: T, count: Long, err: Long) = {
+    val counters1 = counters + (item -> ((count, err)))
+    val buckets1 = buckets + (count -> (buckets.getOrElse(count, Set()) + item))
+    SSMany(capacity, counters1, buckets1)
+  }
+
+  // add a single element
+  private[algebird] def add(x: SSOne[T]): SSMany[T] = {
+    require(x.capacity == capacity)
+    if (counters.contains(x.item))
+      bump(x.item)
+    else
+      (if (exact) this else this.loseOne).introduce(x.item, min + 1L, min)
+  }
+
+  // merge two stream summaries
+  private def merge(x: SSMany[T]): SSMany[T] = {
+    require(x.capacity == capacity)
+    val counters1 = Map() ++
+      (counters.keySet ++ x.counters.keySet).toList
+        .map { key =>
+          val (count1, err1) = counters.getOrElse(key, (min, min))
+          val (count2, err2) = x.counters.getOrElse(key, (x.min, x.min))
+          key -> ((count1 + count2, err1 + err2))
+        }
+        .sorted(SpaceSaver.ordering)
+        .take(capacity)
+    SSMany(capacity, counters1)
+  }
+
+  override def ++(other: SpaceSaver[T]): SpaceSaver[T] = other match {
+    case other: SSOne[?]  => add(other)
+    case other: SSMany[?] => merge(other)
+  }
+}
+
+class SpaceSaverSemigroup[T] extends Semigroup[SpaceSaver[T]] {
+  override def plus(x: SpaceSaver[T], y: SpaceSaver[T]): SpaceSaver[T] = x ++ y
+}
diff --git a/algebird-core/src/main/scala-3/VectorSpace.scala b/algebird-core/src/main/scala-3/VectorSpace.scala
new file mode 100644
index 000000000..f8818600c
--- /dev/null
+++ b/algebird-core/src/main/scala-3/VectorSpace.scala
@@ -0,0 +1,59 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.twitter.algebird
+
+import scala.annotation.implicitNotFound
+
+/**
+ * This class represents a vector space. For the required properties see:
+ *
+ * http://en.wikipedia.org/wiki/Vector_space#Definition
+ */
+object VectorSpace extends VectorSpaceOps with Implicits
+
+sealed trait VectorSpaceOps {
+  def scale[F, C[_]](v: F, c: C[F])(implicit vs: VectorSpace[F, C]): C[F] =
+    vs.scale(v, c)
+  def from[F, C[_]](scaleFn: (F, C[F]) => C[F])(implicit r: Ring[F], cGroup: Group[C[F]]): VectorSpace[F, C] =
+    new VectorSpace[F, C] {
+      override def ring: Ring[F] = r
+      override def group: Group[C[F]] = cGroup
+      override def scale(v: F, c: C[F]): C[F] =
+        if (r.isNonZero(v)) scaleFn(v, c) else cGroup.zero
+    }
+}
+private object VectorSpaceOps extends VectorSpaceOps
+
+sealed trait Implicits extends LowPrioImplicits {
+  implicit def indexedSeqSpace[T: Ring]: VectorSpace[T, IndexedSeq] =
+    VectorSpaceOps.from[T, IndexedSeq]((s, seq) => seq.map(Ring.times(s, _)))
+}
+
+sealed trait LowPrioImplicits {
+  implicit def mapSpace[K, T: Ring]: VectorSpace[T, Map[K, _]] =
+    VectorSpaceOps.from[T, Map[K, _]] { (s, m) =>
+      m.transform { case (_, v) => Ring.times(s, v) }
+    }
+}
+
+@implicitNotFound(msg = "Cannot find VectorSpace type class for Container: ${C} and Ring: ${F}")
+trait VectorSpace[F, C[_]] extends java.io.Serializable {
+  implicit def ring: Ring[F]
+  def field: Ring[F] = ring // this is for compatibility with older versions
+  implicit def group: Group[C[F]]
+  def scale(v: F, c: C[F]): C[F]
+}
diff --git a/algebird-core/src/main/scala-3/monad/EitherMonad.scala b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
new file mode 100644
index 000000000..b6d5e2ffc
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/EitherMonad.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// Monad for Either, used for modeling computations that can fail, where L is the type of the error
+object EitherMonad {
+  class Error[L] extends Monad[Either[L, _]] {
+    override def apply[R](r: R): Right[L, R] = Right(r)
+
+    override def flatMap[T, U](self: Either[L, T])(next: T => Either[L, U]): Either[L, U] =
+      self.right.flatMap(next)
+
+    override def map[T, U](self: Either[L, T])(fn: T => U): Either[L, U] =
+      self.right.map(fn)
+  }
+
+  implicit def monad[L]: Monad[Either[L, _]] = new Error[L]
+
+  def assert[L](truth: Boolean, failure: => L): Either[L, Unit] =
+    if (truth) Right(()) else Left(failure)
+}
diff --git a/algebird-core/src/main/scala-3/monad/Reader.scala b/algebird-core/src/main/scala-3/monad/Reader.scala
new file mode 100644
index 000000000..e0747af20
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/Reader.scala
@@ -0,0 +1,76 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.Monad
+
+// TODO this is general, move somewhere better
+
+// Reader monad: represents a series of operations that read from a shared environment
+// type (the input to the function)
+
+sealed trait Reader[-Env, +T] {
+  def apply(env: Env): T
+  def flatMap[E1 <: Env, U](next: T => Reader[E1, U]): Reader[E1, U] =
+    FlatMappedReader[E1, T, U](this, next)
+  def map[U](thatFn: T => U): Reader[Env, U] =
+    FlatMappedReader(this, (t: T) => ConstantReader(thatFn(t)))
+}
+
+final case class ConstantReader[+T](get: T) extends Reader[Any, T] {
+  override def apply(env: Any): T = get
+  override def map[U](fn: T => U): ConstantReader[U] = ConstantReader(fn(get))
+  override def flatMap[E1 <: Any, U](next: T => Reader[E1, U]): Reader[E1, U] =
+    next(get)
+}
+final case class ReaderFn[E, +T](fn: E => T) extends Reader[E, T] {
+  override def apply(env: E): T = fn(env)
+}
+final case class FlatMappedReader[E, U, +T](first: Reader[E, U], fn: U => Reader[E, T]) extends Reader[E, T] {
+  override def apply(env: E): T = {
+    @annotation.tailrec
+    def loop(r: Reader[E, Any], stack: List[(Any) => Reader[E, Any]]): Any =
+      r match {
+        case ConstantReader(get) =>
+          stack match {
+            case head :: tail => loop(head(get), tail)
+            case Nil => get
+          }
+        case ReaderFn(fn) =>
+          stack match {
+            case head :: tail => loop(head(fn(env)), tail)
+            case Nil => fn(env)
+          }
+        case FlatMappedReader(first, nextFn) => loop(first, nextFn :: stack)
+      }
+    loop(first, List(fn.asInstanceOf[(Any) => Reader[E, Any]])).asInstanceOf[T]
+  }
+}
+
+object Reader {
+  def const[T](t: T): Reader[Any, T] = ConstantReader(t)
+  implicit def apply[E, T](fn: (E) => T): Reader[E, T] = ReaderFn(fn)
+
+  class ReaderM[Env] extends Monad[Reader[Env, _]] {
+    override def apply[T](t: T): ConstantReader[T] = ConstantReader(t)
+    override def flatMap[T, U](self: Reader[Env, T])(next: T => Reader[Env, U]): Reader[Env, U] =
+      self.flatMap(next)
+    override def map[T, U](self: Reader[Env, T])(fn: T => U): Reader[Env, U] =
+      self.map(fn)
+  }
+
+  implicit def monad[Env]: Monad[Reader[Env, _]] = new ReaderM[Env]
+}
diff --git a/algebird-core/src/main/scala-3/monad/StateWithError.scala b/algebird-core/src/main/scala-3/monad/StateWithError.scala
new file mode 100644
index 000000000..e15a9ebc3
--- /dev/null
+++ b/algebird-core/src/main/scala-3/monad/StateWithError.scala
@@ -0,0 +1,130 @@
+/*
+ Copyright 2013 Twitter, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package com.twitter.algebird.monad
+
+import com.twitter.algebird.{Monad, Semigroup}
+
+/**
+ * Monad to handle mutating input state and possible failures. This is used to interact in the planning phase
+ * with existing mutable APIs (like Storm or Cascading), but retain the ability to compose carefully.
+ */
+sealed trait StateWithError[S, +F, +T] {
+  def join[F1 >: F, U](
+      that: StateWithError[S, F1, U],
+      mergeErr: (F1, F1) => F1,
+      mergeState: (S, S) => S
+  ): StateWithError[S, F1, (T, U)] =
+    join(that)(Semigroup.from(mergeErr), Semigroup.from(mergeState))
+
+  def join[F1 >: F, U](that: StateWithError[S, F1, U])(implicit
+      sgf: Semigroup[F1],
+      sgs: Semigroup[S]
+  ): // TODO: deep joins could blow the stack, not yet using trampoline here
+  StateWithError[S, F1, (T, U)] =
+    StateFn { (requested: S) =>
+      (run(requested), that.run(requested)) match {
+        case (Right((s1, r1)), Right((s2, r2))) =>
+          Right((sgs.plus(s1, s2), (r1, r2)))
+        case (Left(err1), Left(err2)) =>
+          Left(sgf.plus(err1, err2)) // combine the errors when both sides fail
+        case (Left(err), _) => Left(err)
+        case (_, Left(err)) => Left(err)
+      }
+    }
+
+  def apply(state: S): Either[F, (S, T)] = run(state)
+
+  def run(state: S): Either[F, (S, T)]
+
+  def flatMap[F1 >: F, U](next: T => StateWithError[S, F1, U]): StateWithError[S, F1, U] =
+    FlatMappedState(this, next)
+
+  def map[U](fn: (T) => U): StateWithError[S, F, U] =
+    FlatMappedState(this, (t: T) => StateWithError.const(fn(t)))
+}
+
+/** Simple wrapper of a function in the Monad */
+final case class StateFn[S, F, T](fn: S => Either[F, (S, T)]) extends StateWithError[S, F, T] {
+  override def run(state: S): Either[F, (S, T)] = fn(state)
+}
+
+/**
+ * A trampolining instance that should prevent stack overflow at the expense of performance
+ */
+final case class FlatMappedState[S, F, T, U](start: StateWithError[S, F, T], fn: T => StateWithError[S, F, U])
+    extends StateWithError[S, F, U] {
+  override def run(state: S): Either[F, (S, U)] = {
+    @annotation.tailrec
+    def loop(inState: S, st: StateWithError[S, F, Any], stack: List[Any => StateWithError[S, F, Any]]): Any =
+      st match {
+        case StateFn(fn) =>
+          fn(inState) match {
+            case err @ Left(_) => err // bail at first error
+            case noError @ Right((newState, out)) =>
+              stack match {
+                case head :: tailStack => loop(newState, head(out), tailStack)
+                case Nil => noError // recursion ends
+              }
+          }
+        case FlatMappedState(st, next) => loop(inState, st, next :: stack)
+      }
+    loop(state, this, Nil).asInstanceOf[Either[F, (S, U)]]
+  }
+}
+
+object StateWithError {
+  def getState[S]: StateWithError[S, Nothing, S] =
StateFn((state: S) => Right((state, state))) + def putState[S](newState: S): StateWithError[S, Nothing, Unit] = + StateFn((_: S) => Right((newState, ()))) + def swapState[S](newState: S): StateWithError[S, Nothing, S] = + StateFn((old: S) => Right((newState, old))) + + def const[S, T](t: T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def lazyVal[S, T](t: => T): StateWithError[S, Nothing, T] = + StateFn((state: S) => Right((state, t))) + def failure[S, F](f: F): StateWithError[S, F, Nothing] = + StateFn(_ => Left(f)) + + /** + * Use like fromEither[Int](Right("good")) to get a constant Either in the monad + */ + def fromEither[S]: ConstantStateMaker[S] = new ConstantStateMaker[S] + class ConstantStateMaker[S] { + def apply[F, T](either: Either[F, T]): StateWithError[S, F, T] = { (s: S) => either.right.map((s, _)) } + } + + class FunctionLifter[S] { + def apply[I, F, T](fn: I => Either[F, T]): (I => StateWithError[S, F, T]) = { (i: I) => + StateFn((s: S) => fn(i).right.map((s, _))) + } + } + // TODO this should move to Monad and work for any Monad + def toKleisli[S]: FunctionLifter[S] = new FunctionLifter[S] + + implicit def apply[S, F, T](fn: S => Either[F, (S, T)]): StateWithError[S, F, T] = StateFn(fn) + implicit def monad[S, F]: Monad[StateWithError[S, F, _]] = new StateFMonad[F, S] + + class StateFMonad[F, S] extends Monad[StateWithError[S, F, _]] { + override def apply[T](const: T): StateWithError[S, Nothing, T] = { (s: S) => Right((s, const)) } + override def flatMap[T, U]( + earlier: StateWithError[S, F, T] + )(next: T => StateWithError[S, F, U]): StateWithError[S, F, U] = + earlier.flatMap(next) + } +} diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala index 29329b788..53a0eff17 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveCache.scala @@ -87,9 +87,9 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3. summingCache = new SummingWithHitsCache(currentCapacity) if (currentCapacity == maxCapacity) - sentinelCache.stopGrowing + sentinelCache.stopGrowing() else - sentinelCache.clear + sentinelCache.clear() } ret } @@ -101,7 +101,7 @@ class AdaptiveCache[K, V: Semigroup](maxCapacity: Int, growthMargin: Double = 3. 
override def flush: Option[Map[K, V]] = { val ret = summingCache.flush - sentinelCache.clear + sentinelCache.clear() ret } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala index e47fb8792..31f5117bc 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveVector.scala @@ -145,7 +145,7 @@ object AdaptiveVector { def iteq: Boolean = (lit.hasNext, rit.hasNext) match { case (true, true) => - val (lnext, rnext) = (lit.next, rit.next) + val (lnext, rnext) = (lit.next(), rit.next()) if (lnext._1 == rnext._1 && Equiv[V].equiv(lnext._2, rnext._2)) iteq else diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala index 32a66339a..211cac612 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Applicative.scala @@ -42,7 +42,7 @@ trait Applicative[M[_]] extends Functor[M] { case _ => val mb = ms.foldLeft(apply(Seq.newBuilder[T]))((mb, mt) => joinWith(mb, mt)((b, t) => b += t)) - map(mb)(_.result) + map(mb)(_.result()) } def joinWith[T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V): M[V] = map(join(mt, mu)) { case (t, u) => fn(t, u) } @@ -102,7 +102,7 @@ object Applicative { )(implicit app: Applicative[M], cbf: Factory[T, R[T]]): M[R[T]] = { val bldr = cbf.newBuilder val mbldr = ms.iterator.foldLeft(app.apply(bldr))((mb, mt) => app.joinWith(mb, mt)(_ += _)) - app.map(mbldr)(_.result) + app.map(mbldr)(_.result()) } def joinWith[M[_], T, U, V](mt: M[T], mu: M[U])(fn: (T, U) => V)(implicit app: Applicative[M]): M[V] = diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala index 9d684db79..efef198e3 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/AveragedValue.scala @@ -112,7 +112,7 @@ object AveragedValue { */ def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, AveragedValue, Double] = Aggregator - .prepareMonoid { n: N => AveragedValue(num.toDouble(n)) } + .prepareMonoid { (n: N) => AveragedValue(num.toDouble(n)) } .andThenPresent(_.value) /** diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala index d209a98dc..0db108a3a 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Batched.scala @@ -104,7 +104,7 @@ object Batched { if (ts.iterator.isEmpty) None else { val it = ts.iterator - val t0 = it.next + val t0 = it.next() Some(Item(t0).append(it)) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala index bda97981d..5ea0f11d5 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala @@ -33,7 +33,7 @@ object RichCBitSet { def fromBitSet(bs: BitSet): CBitSet = { val nbs = new CBitSet val it = bs.iterator - while (it.hasNext) { nbs.set(it.next) } + while (it.hasNext) { nbs.set(it.next()) } nbs } implicit def cb2rcb(cb: CBitSet): RichCBitSet = new RichCBitSet(cb) @@ -235,7 
+235,7 @@ case class BloomFilterMonoid[A](numHashes: Int, width: Int)(implicit hash: Hash1 case BFInstance(_, bitset, _) => // these Ints are boxed so, that's a minor bummer val iter = bitset.iterator - while (iter.hasNext) { set(iter.next) } + while (iter.hasNext) { set(iter.next()) } } if (sets == 0) Some(zero) else if (sets == numHashes && (oneItem != null)) Some(oneItem) @@ -307,7 +307,7 @@ object BF { new IntIterator { val boxedIter: Iterator[Int] = bitset.iterator override def hasNext: Boolean = boxedIter.hasNext - override def next: Int = boxedIter.next + override def next: Int = boxedIter.next() } case BFZero(_, _) => new IntIterator { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala index e8c45b668..102f2e3c7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/BufferedOperation.scala @@ -45,7 +45,7 @@ abstract class ArrayBufferedOperation[I, O](size: Int) extends Buffered[I, O] { if (buffer.isEmpty) None else { val res = operate(buffer.toSeq) - buffer.clear + buffer.clear() Some(res) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala index 2f6d6e988..3a01eee07 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/ExpHist.scala @@ -105,7 +105,7 @@ case class ExpHist( b += bucket }, _ => Vector.newBuilder[Bucket], - x => addAll(x.result) + x => addAll(x.result()) ) // This internal method assumes that the instance is stepped forward @@ -182,7 +182,7 @@ object ExpHist { case class Bucket(size: Long, timestamp: Timestamp) object Bucket { - implicit val ord: Ordering[Bucket] = Ordering.by { b: Bucket => (b.timestamp, b.size) } + implicit val ord: Ordering[Bucket] = Ordering.by { (b: Bucket) => (b.timestamp, b.size) } } /** @@ -260,7 +260,7 @@ object ExpHist { if (desired.isEmpty) Vector.empty else { val input = buckets.dropWhile(_.size == 0) - val bucketSize +: tail = desired + val bucketSize +: tail = desired : @unchecked val remaining = drop(bucketSize, input) input.head.copy(size = bucketSize) +: rebucket(remaining, tail) } @@ -275,7 +275,7 @@ object ExpHist { * If an element wasn't fully consumed, the remainder will be stuck back onto the head. 
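   *
   * For example (hypothetical values): dropping 3 from buckets of sizes [2, 2] consumes the first bucket
   * entirely and leaves a remainder of size 1 in the second, so that
   * {{{
   * drop(3, Vector(Bucket(2, t1), Bucket(2, t2))) == Vector(Bucket(1, t2))
   * }}}
   * where `t1` and `t2` are arbitrary timestamps.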
*/ @tailrec private[this] def drop(toDrop: Long, input: Vector[Bucket]): Vector[Bucket] = { - val (b @ Bucket(count, _)) +: tail = input + val (b @ Bucket(count, _)) +: tail = input : @unchecked (toDrop - count) match { case 0 => tail case x if x < 0 => b.copy(size = -x) +: tail diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala index 0d86aa03e..03b1dad0c 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HashingTrick.scala @@ -27,7 +27,7 @@ class HashingTrickMonoid[V: Group](bits: Int, seed: Int = 123456) extends Monoid Monoid.plus(left, right) def init[K](kv: (K, V))(implicit ev: K => Array[Byte]): AdaptiveVector[V] = { - val (long1, long2) = hash(kv._1) + val (long1, long2):(Long,Long) = hash(kv._1) val index = (long1 & bitMask).toInt val isNegative = (long2 & 1) == 1 diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala index adac1141d..0fc0b97e6 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala @@ -419,7 +419,7 @@ case class SparseHLL(override val bits: Int, maxRhow: Map[Int, Max[Byte]]) exten val iter: Iterator[(Int, Max[Byte])] = maxRhow.iterator while (iter.hasNext) { - val (idx, _) = iter.next + val (idx, _) = iter.next() val existing: Byte = newContents(idx) val other: Byte = maxRhow(idx).get @@ -575,7 +575,7 @@ class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] with BoundedSemilatti None } else { val iter = items.iterator.buffered - var curValue = iter.next + var curValue = iter.next() while (iter.hasNext) { curValue = (curValue, iter.head) match { case (DenseHLL(_, _), _) => denseUpdate(curValue, iter) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala index f795b1a4c..75b5c7ccc 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLogSeries.scala @@ -62,7 +62,7 @@ case class HLLSeries(bits: Int, rows: Vector[Map[Int, Long]]) { while (i >= 0) { val it = rows(i).iterator while (it.hasNext) { - val (k, t) = it.next + val (k, t) = it.next() if (t >= threshold && seen.add(k)) { sum += HyperLogLog.negativePowersOfTwo(i + 1) } @@ -142,7 +142,7 @@ class HyperLogLogSeriesMonoid(val bits: Int) extends Monoid[HLLSeries] { val bldr = Vector.newBuilder[Map[Int, Long]] val lit = left.rows.iterator val rit = right.rows.iterator - while (lit.hasNext && rit.hasNext) bldr += combine(lit.next, rit.next) + while (lit.hasNext && rit.hasNext) bldr += combine(lit.next(), rit.next()) val zipped = bldr.result() HLLSeries(bits, zipped ++ right.rows.slice(ln, rn)) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala index df95c4691..6e84c7541 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Max.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Max.scala @@ -160,8 +160,8 @@ private[algebird] sealed abstract class LowPriorityMaxInstances { while (true) { if (xs.hasNext) { if (ys.hasNext) { - val x = xs.next - val y = ys.next + val x = xs.next() + val y = ys.next() val cmp = ord.compare(x, y) if (cmp 
!= 0) return cmp } else { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala index e5c6df39b..fc4dd10e8 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Metric.scala @@ -73,7 +73,7 @@ object Metric { def minkowskiMap[K, V: Monoid: Metric](p: Double): Metric[Map[K, V]] = Metric.from { (a: Map[K, V], b: Map[K, V]) => - val outP = (a.keySet ++ b.keySet).map { key: K => + val outP = (a.keySet ++ b.keySet).map { (key: K) => val v1 = a.getOrElse(key, Monoid.zero[V]) val v2 = b.getOrElse(key, Monoid.zero[V]) math.pow(implicitly[Metric[V]].apply(v1, v2), p) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala index ada06450b..5c6b9ebc9 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala @@ -69,7 +69,7 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N private val hashFunctions = { val r = new scala.util.Random(seed) val numHashFunctions = math.ceil(numBytes / 16.0).toInt - (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong)) + (1 to numHashFunctions).map(_ => MurmurHash128(r.nextLong())) } /** Signature for empty set, needed to be a proper Monoid */ diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala index 74eb5a428..9da380b3e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/MomentsGroup.scala @@ -248,7 +248,7 @@ object Moments { val fold: Fold[Double, Moments] = momentsMonoid.zero.fold def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, Moments, Moments] = - Aggregator.prepareMonoid { n: N => Moments(num.toDouble(n)) } + Aggregator.prepareMonoid { (n: N) => Moments(num.toDouble(n)) } /** * Create a Moments object given a single value. 
This is useful for initializing moment calculations at the diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala index de8c31a71..cd14c7a96 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Monad.scala @@ -57,7 +57,7 @@ object Monad { if (xs.isEmpty) monad.apply(acc) else - monad.flatMap(fn(acc, xs.head)) { t: T => foldM(t, xs.tail)(fn) } + monad.flatMap(fn(acc, xs.head)) { (t: T) => foldM(t, xs.tail)(fn) } // Some instances of the Monad typeclass (case for a macro): implicit val list: Monad[List] = new Monad[List] { diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala index a10d6d8a8..1d81a888e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Preparer.scala @@ -187,10 +187,10 @@ trait FlatMapPreparer[A, T] extends Preparer[A, T] { def prepareFn: A => TraversableOnce[T] def map[U](fn: T => U): FlatMapPreparer[A, U] = - FlatMapPreparer { a: A => prepareFn(a).map(fn) } + FlatMapPreparer { (a: A) => prepareFn(a).map(fn) } override def flatMap[U](fn: T => TraversableOnce[U]): FlatMapPreparer[A, U] = - FlatMapPreparer { a: A => prepareFn(a).flatMap(fn) } + FlatMapPreparer { (a: A) => prepareFn(a).flatMap(fn) } override def monoidAggregate[B, C](aggregator: MonoidAggregator[T, B, C]): MonoidAggregator[A, B, C] = aggregator.sumBefore.composePrepare(prepareFn) @@ -242,10 +242,10 @@ object FlatMapPreparer { override val prepareFn: TraversableOnce[A] => TraversableOnce[A] = (a: TraversableOnce[A]) => a override def map[U](fn: A => U): FlatMapPreparer[TraversableOnce[A], U] = - FlatMapPreparer { a: TraversableOnce[A] => a.map(fn) } + FlatMapPreparer { (a: TraversableOnce[A]) => a.map(fn) } override def flatMap[U](fn: A => TraversableOnce[U]): FlatMapPreparer[TraversableOnce[A], U] = - FlatMapPreparer { a: TraversableOnce[A] => a.flatMap(fn) } + FlatMapPreparer { (a: TraversableOnce[A]) => a.flatMap(fn) } override def monoidAggregate[B, C]( aggregator: MonoidAggregator[A, B, C] diff --git a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala index 2376cfbf8..c78897715 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/QTree.scala @@ -151,9 +151,9 @@ class QTreeSemigroup[A](k: Int)(implicit val underlyingMonoid: Monoid[A]) extend val batchSize = compressBatchSize var count = 1 // start at 1, so we only compress after batchSize items val iter = items.toIterator - var result = iter.next // due to not being empty, this does not throw + var result = iter.next() // due to not being empty, this does not throw while (iter.hasNext) { - result = result.merge(iter.next) + result = result.merge(iter.next()) count += 1 if (count % batchSize == 0) { result = result.compress(k) @@ -428,8 +428,8 @@ class QTree[@specialized(Int, Long, Float, Double) A] private[algebird] ( print(" (" + parentCount + ")") } println(" {" + _sum + "}") - lowerChild.foreach(_.dump) - upperChild.foreach(_.dump) + lowerChild.foreach(_.dump()) + upperChild.foreach(_.dump()) } /** diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala index f5973c338..e327ed57c 
100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala @@ -145,7 +145,7 @@ case class SketchMapParams[K](seed: Int, width: Int, depth: Int, heavyHittersCou val numCounters = width (0 to (numHashes - 1)).map { _ => val smhash: SketchMapHash[K] = - SketchMapHash(CMSHash[Long](r.nextInt, 0, numCounters), seed)(serialization) + SketchMapHash(CMSHash[Long](r.nextInt(), 0, numCounters), seed)(serialization) (k: K) => smhash(k) } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala index 4cd9a1505..e2302e899 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingCache.scala @@ -57,7 +57,7 @@ class SummingCache[K, V](capacity: Int)(implicit sgv: Semigroup[V]) extends Stat override def flush: Option[Map[K, V]] = { // Get a copy of the cache, since it is mutable val res = optNonEmpty(cache.iterator.toMap) - cache.clear + cache.clear() res } override def isFlushed: Boolean = cache.isEmpty diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala index cd9e7deaf..7644aca2e 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/SummingIterator.scala @@ -49,16 +49,16 @@ class SummingIterator[V](summer: StatefulSummer[V], it: Iterator[V]) // This has to be lazy because it shouldn't be touched until the val it is exhausted protected lazy val tailIter: Iterator[V] = summer.flush.iterator override def hasNext: Boolean = it.hasNext || tailIter.hasNext - override def next: V = nextInternal + override def next(): V = nextInternal @tailrec private def nextInternal: V = if (it.hasNext) { - summer.put(it.next) match { + summer.put(it.next()) match { case None => nextInternal case Some(v) => v } } else { - tailIter.next + tailIter.next() } } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala index 8df431d7e..199553780 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/Window.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Window.scala @@ -126,7 +126,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] { val it = ws.toIterator var queue = Queue.empty[T] while (it.hasNext) { - queue = (queue ++ it.next.items).takeRight(windowSize) + queue = (queue ++ it.next().items).takeRight(windowSize) } Some(Window(monoid.sum(queue), queue)) } @@ -140,7 +140,7 @@ abstract class WindowMonoid[T](windowSize: Int) extends Monoid[Window[T]] { while (it.hasNext) { // avoid materializing the whole list in memory // at one time - queue = queue :+ it.next + queue = queue :+ it.next() size = size + 1 if (size > windowSize) { queue = queue.tail diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala index d58a6c9ab..3e90cadcf 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BitSet.scala @@ -573,7 +573,7 @@ object BitSet { BitSet.adoptedUnion(this, rhs) } else { // height == rhs.height, so we know rhs is a Branch. 
- val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked val cs = new Array[BitSet](32) var i = 0 while (i < 32) { @@ -605,7 +605,7 @@ object BitSet { Empty } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs: @unchecked val cs = new Array[BitSet](32) var i = 0 var nonEmpty = false @@ -643,7 +643,7 @@ object BitSet { false } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked var i = 0 while (i < 32) { val x = children(i) @@ -688,7 +688,7 @@ object BitSet { this | rhs } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs : @unchecked val cs = new Array[BitSet](32) var i = 0 while (i < 32) { @@ -805,7 +805,7 @@ object BitSet { throw InternalError("branch misaligned") } else { // height == rhs.height, so we know rhs is a Branch. - val Branch(_, _, rcs) = rhs + val Branch(_, _, rcs) = rhs: @unchecked var i = 0 while (i < 32) { val x = children(i) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala index 71a861075..572dce367 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/immutable/BloomFilter.scala @@ -272,7 +272,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H override def +(other: A): Hash = { val bs = BitSet.newEmpty(0) - val hash = new Array[Int](numHashes) + val hash = new Array[Int](this.numHashes) hashToArray(item, hash) bs.mutableAdd(hash) @@ -336,7 +336,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H // use an approximation width of 0.05 override def size: Approximate[Long] = - BloomFilter.sizeEstimate(numBits, numHashes, width, 0.05) + BloomFilter.sizeEstimate(this.numBits, numHashes, width, 0.05) } implicit val monoid: Monoid[Hash] with BoundedSemilattice[Hash] = @@ -402,7 +402,7 @@ final case class BloomFilter[A](numHashes: Int, width: Int)(implicit val hash: H /** * Create a bloom filter with multiple items from an iterator */ - def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item)) + def create(data: Iterator[A]): Hash = monoid.sum(data.map(Item.apply)) val empty: Hash = Empty diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala index f970c43f3..c50d912d7 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala @@ -95,7 +95,7 @@ object AdaptiveMatrix { var row = 0 val iter = storage.iterator while (iter.hasNext) { - val curRow = iter.next + val curRow = iter.next() curRow.foreach { case (col, value) => buffer(row * cols + col) = value } @@ -114,7 +114,7 @@ object AdaptiveMatrix { val sparseStorage = (0 until rows).map(_ => MMap[Int, V]()).toIndexedSeq while (iter.hasNext) { - val current = iter.next + val current = iter.next() current match { case d @ DenseMatrix(_, _, _) => return denseUpdate(d, iter) case s @ SparseColumnMatrix(_) => diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala index 
69f553360..96f201eb8 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala @@ -49,7 +49,7 @@ case class SparseColumnMatrix[V: Monoid](rowsByColumns: IndexedSeq[AdaptiveVecto while (row < rows) { val iter = rowsByColumns(row).denseIterator while (iter.hasNext) { - val (col, value) = iter.next + val (col, value):(Int,V) = iter.next() val indx = row * lcols + col buffer(indx) = valueMonoid.plus(buffer(indx), value) } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala index 5c3e4c37b..38c026937 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/IterCallStatistics.scala @@ -36,7 +36,7 @@ private class IterCallStatistics(threadSafe: Boolean) { total.add(v) // log2(v + 1) for v up to 2^maxBucket val bucket = min(64 - numberOfLeadingZeros(v), maxBucket) - distribution(bucket).increment + distribution(bucket).increment() } def count: Long = distribution.foldLeft(0L)(_ + _.get) // sum @@ -59,8 +59,8 @@ private class IterCallStatistics(threadSafe: Boolean) { private class CountingIterator[T](val i: Iterator[T]) extends Iterator[T] { private[this] final var nextCount: Long = 0 override def hasNext: Boolean = i.hasNext - override def next: T = { - val n = i.next + override def next(): T = { + val n = i.next() nextCount += 1 n } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala index ce166c250..3becb8b8a 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/statistics/Statistics.scala @@ -37,7 +37,7 @@ class StatisticsSemigroup[T](threadSafe: Boolean = true)(implicit wrappedSemigro def getSumOptionCallTime: Long = sumOptionCallsStats.getTotalCallTime override def plus(x: T, y: T): T = { - plusCallsCount.increment + plusCallsCount.increment() Semigroup.plus(x, y) } @@ -66,7 +66,7 @@ class StatisticsMonoid[T](threadSafe: Boolean = true)(implicit wrappedMonoid: Mo def getSumCallTime: Long = sumCallsStats.getTotalCallTime override def zero: T = { - zeroCallsCount.increment + zeroCallsCount.increment() Monoid.zero } @@ -95,12 +95,12 @@ class StatisticsGroup[T](threadSafe: Boolean = true)(implicit group: Group[T]) def getMinusCallCount: Long = minusCallsCount.get override def negate(x: T): T = { - negateCallsCount.increment + negateCallsCount.increment() Group.negate(x) } override def minus(l: T, r: T): T = { - minusCallsCount.increment + minusCallsCount.increment() Group.minus(l, r) } @@ -129,12 +129,12 @@ class StatisticsRing[T](threadSafe: Boolean = true)(implicit ring: Ring[T]) def getProductCallTime: Long = productCallsStats.getTotalCallTime override def one: T = { - oneCallsCount.increment + oneCallsCount.increment() Ring.one } override def times(x: T, y: T): T = { - timesCallsCount.increment + timesCallsCount.increment() Ring.times(x, y) } diff --git a/build.sbt b/build.sbt index afc7de9c7..bcd23c4f2 100644 --- a/build.sbt +++ b/build.sbt @@ -31,6 +31,8 @@ def scalaBinaryVersion(scalaVersion: String) = scalaVersion match { case version => sys.error(s"unsupported scala version $version") } +def isScala3(scalaVersion: 
String) = scalaVersion.startsWith("3.") + def isScala212x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.12" def isScala213x(scalaVersion: String) = scalaBinaryVersion(scalaVersion) == "2.13" @@ -110,6 +112,16 @@ val sharedSettings = Seq( scalaVersion.value ) ) ++ mimaSettings +// NOTE: After dropping Scala 2.11, we can remove src/main/scala-2.11 and share sources between scala 2.12, 2.13 and 3.x. +lazy val kindprojectorSettings = Seq( + Compile / scalacOptions ++= { + CrossVersion.partialVersion(scalaVersion.value) match { + case Some((3, _)) => Seq("-Ykind-projector:underscores") + case Some((2, 12 | 13)) => Seq("-Xsource:3", "-P:kind-projector:underscore-placeholders") + case _ => Seq.empty + } + } +) lazy val noPublishSettings = Seq( publish / skip := true, @@ -208,33 +220,43 @@ def module(name: String) = { .settings(sharedSettings ++ Seq(Keys.name := id, mimaPreviousArtifacts := previousVersion(name).toSet)) } -lazy val algebirdCore = module("core").settings( - crossScalaVersions += "2.13.8", - initialCommands := """ +lazy val algebirdCore = module("core") + .settings( + crossScalaVersions += "2.13.8", + // crossScalaVersions += "3.2.2", + initialCommands := """ import com.twitter.algebird._ """.stripMargin('|'), - libraryDependencies ++= - Seq( - "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion, - "org.typelevel" %% "algebra" % algebraVersion, - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "org.scalatest" %% "scalatest" % scalaTestVersion % "test", - "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat - ) ++ { - if (isScala213x(scalaVersion.value)) { - Seq() - } else { - Seq(compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full))) - } - }, - addCompilerPlugin(("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full)), - Compile / sourceGenerators += Def.task { - GenTupleAggregators.gen((Compile / sourceManaged).value) - }.taskValue, - // Scala 2.12's doc task was failing. - Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))), - Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a")) -) + libraryDependencies ++= + Seq( + "com.googlecode.javaewah" % "JavaEWAH" % javaEwahVersion, + ("org.typelevel" %% "algebra" % algebraVersion).cross(CrossVersion.for3Use2_13), + "org.scalatest" %% "scalatest" % scalaTestVersion % "test", + "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat + ) ++ { + if (isScala3(scalaVersion.value)) { + Seq.empty + } else if (isScala213x(scalaVersion.value)) { + Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full) + ) + } else { + Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + compilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)), + compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorVersion).cross(CrossVersion.full) + ) + } + }, + Compile / sourceGenerators += Def.task { + GenTupleAggregators.gen((Compile / sourceManaged).value) + }.taskValue, + // Scala 2.12's doc task was failing. + Compile / doc / sources ~= (_.filterNot(_.absolutePath.contains("javaapi"))), + Test / testOptions := Seq(Tests.Argument(TestFrameworks.JUnit, "-a")) + ) + .settings(kindprojectorSettings) lazy val algebirdTest = module("test") .settings(